Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead

Eric Huang <jinhuieric.huang@xxxxxxx> · Tue, 13 Jul 2021 13:23:11 -0400

I am porting the code to amd-staging-drm-next. In theory it should
improve the latency a lot: with this patch the allocated array has 24
entries (PAGE_SIZE / sizeof(struct drm_mm_node)), whereas it had 8192
before. So the latency should be reduced by 98 us.

Regards,
Eric

On 2021-07-13 12:11 p.m., Felix Kuehling wrote:
Am 2021-07-13 um 9:32 a.m. schrieb Christian König:
For allocations larger than 48MiB we need more than a page for the
housekeeping in the worst case, resulting in the usual vmalloc overhead.

Try to avoid this by assuming the good case and only falling back to the
worst case if that didn't work.

Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 80 +++++++++++++++-----
  1 file changed, 60 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 2fd77c36a1ff..ab8c5e28df7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -361,19 +361,23 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
   * @man: TTM memory type manager
   * @tbo: TTM BO we need this range for
   * @place: placement flags and restrictions
- * @mem: the resulting mem object
+ * @num_nodes: number of page nodes to use.
+ * @pages_per_node: number of pages per node to use.
+ * @res: the resulting mem object
   *
   * Allocate VRAM for the given BO.
   */
  static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  			       struct ttm_buffer_object *tbo,
  			       const struct ttm_place *place,
+			       unsigned long num_nodes,
+			       unsigned long pages_per_node,
  			       struct ttm_resource **res)
  {
-	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
  	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
  	struct amdgpu_device *adev = to_amdgpu_device(mgr);
  	uint64_t vis_usage = 0, mem_bytes, max_bytes;
+	unsigned long lpfn, pages_left, pages;
  	struct ttm_range_mgr_node *node;
  	struct drm_mm *mm = &mgr->mm;
  	enum drm_mm_insert_mode mode;
@@ -395,21 +399,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  		goto error_sub;
  	}
  
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-		pages_per_node = ~0ul;
-		num_nodes = 1;
-	} else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		pages_per_node = HPAGE_PMD_NR;
-#else
-		/* default to 2MB */
-		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
-#endif
-		pages_per_node = max_t(uint32_t, pages_per_node,
-				       tbo->page_alignment);
-		num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node);
-	}
-
  	node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
  			GFP_KERNEL | __GFP_ZERO);
  	if (!node) {
@@ -431,10 +420,15 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  	i = 0;
  	spin_lock(&mgr->lock);
  	while (pages_left) {
-		uint32_t alignment = tbo->page_alignment;
+		unsigned long alignment = tbo->page_alignment;
+
+		if (i >= num_nodes) {
+			r = -E2BIG;
+			goto error_free;
+		}
  
  		if (pages >= pages_per_node)
-			alignment = pages_per_node;
+			alignment = max(alignment, pages_per_node);
I don't understand this change. Is this an unrelated fix? pages_per_node
is already bumped up to tbo->page_alignment in amdgpu_vram_mgr_alloc. So
this "max" operation here seems redundant.

Other than that, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>

@JinHuiEric, can you confirm the performance improvement?

Thanks,
   Felix


  
  		r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages,
  						alignment, 0, place->fpfn,
@@ -483,6 +477,52 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  	return r;
  }
  
+/**
+ * amdgpu_vram_mgr_alloc - allocate new range
+ *
+ * @man: TTM memory type manager
+ * @tbo: TTM BO we need this range for
+ * @place: placement flags and restrictions
+ * @res: the resulting mem object
+ *
+ * Allocate VRAM for the given BO.
+ */
+static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
+				 struct ttm_buffer_object *tbo,
+				 const struct ttm_place *place,
+				 struct ttm_resource **res)
+{
+	unsigned long num_nodes, pages_per_node;
+	struct ttm_range_mgr_node *node;
+	int r;
+
+	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, res);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	pages_per_node = HPAGE_PMD_NR;
+#else
+	/* default to 2MB */
+	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
+#endif
+	pages_per_node = max_t(uint32_t, pages_per_node, tbo->page_alignment);
+	num_nodes = DIV_ROUND_UP_ULL(PFN_UP(tbo->base.size), pages_per_node);
+
+	if (struct_size(node, mm_nodes, num_nodes) > PAGE_SIZE) {
+		size_t size = PAGE_SIZE;
+
+		size -= sizeof(struct ttm_range_mgr_node);
+		size /= sizeof(struct drm_mm_node);
+		r = amdgpu_vram_mgr_new(man, tbo, place, size, pages_per_node,
+					res);
+		if (r != -E2BIG)
+			return r;
+	}
+
+	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
+				   res);
+}
+
  /**
   * amdgpu_vram_mgr_del - free ranges
   *
@@ -680,7 +720,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
  }
  
  static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
-	.alloc	= amdgpu_vram_mgr_new,
+	.alloc	= amdgpu_vram_mgr_alloc,
  	.free	= amdgpu_vram_mgr_del,
  	.debug	= amdgpu_vram_mgr_debug
  };

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx