Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

"Paneer Selvam, Arunpravin" <arunpravin.paneerselvam@xxxxxxx> · Tue, 16 Apr 2024 12:20:05 +0530

On 4/16/2024 3:32 AM, Philip Yang wrote:


On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
Now we have two flags for contiguous VRAM buffer allocation.
If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.
This change will simplify the KFD best effort contiguous VRAM 
allocation, because KFD doesn't need set new GEM_ flag.
When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.

AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page 
table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs 
and F/W size related BOs, do all allocation happen at driver 
initialization before the VRAM is fragmented?

- we will try to allocate the contiguous buffer. Say if the
   allocation fails, we fallback to allocate the individual pages.

When we setTTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
   the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer Selvam<Arunpravin.PaneerSelvam@xxxxxxx>
Suggested-by: Christian König<christian.koenig@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
  2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
  		else
  			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  
-		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
  		c++;
  	}
  
@@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
  {
  	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
  	struct ttm_operation_ctx ctx = { false, false };
+	struct ttm_place *places = bo->placements;
+	u32 c = 0;
  	int r, i;
  
  	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
  
  	if (bo->tbo.pin_count) {
  		uint32_t mem_type = bo->tbo.resource->mem_type;
-		uint32_t mem_flags = bo->tbo.resource->placement;
  
  		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
  			return -EINVAL;
  
-		if ((mem_type == TTM_PL_VRAM) &&
-		    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-		    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-			return -EINVAL;
-
This looks like a bug before, but with this patch, the check makes 
sense and is needed.
  		ttm_bo_pin(&bo->tbo);
  
  		if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
  			bo->placements[i].lpfn = lpfn;
  	}
  
+	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+	    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+		places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+

If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, 
should pin and return scattered pages because the RDMA support 
scattered dmabuf. Christian also pointed this out.

If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&

    bo->placements[i].mem_type == TTM_PL_VRAM)
        o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
  	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
  	if (unlikely(r)) {
  		dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
  	return size;
  }
  
+static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+				 const struct ttm_place *place,
+				 unsigned long bo_flags)
+{
+	unsigned long pages_per_block;
+
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+	    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+		pages_per_block = ~0ul;
+	} else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		pages_per_block = HPAGE_PMD_NR;
+#else
+		/* default to 2MB */
+		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+		pages_per_block = max_t(uint32_t, pages_per_block,
+					tbo->page_alignment);
+	}
+
+	return pages_per_block;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
  	struct amdgpu_device *adev = to_amdgpu_device(mgr);
  	u64 vis_usage = 0, max_bytes, min_block_size;
+	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
  	struct amdgpu_vram_mgr_resource *vres;
  	u64 size, remaining_size, lpfn, fpfn;
+	unsigned long bo_flags = bo->flags;
  	struct drm_buddy *mm = &mgr->mm;
  	struct drm_buddy_block *block;
  	unsigned long pages_per_block;
@@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  	if (tbo->type != ttm_bo_type_kernel)
  		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
  
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-		pages_per_block = ~0ul;
-	} else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		pages_per_block = HPAGE_PMD_NR;
-#else
-		/* default to 2MB */
-		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
-#endif
-		pages_per_block = max_t(uint32_t, pages_per_block,
-					tbo->page_alignment);
-	}
+	pages_per_block =
+		amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
  
  	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
  	if (!vres)
@@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  	if (place->flags & TTM_PL_FLAG_TOPDOWN)
  		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
  
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+	    place->flags & TTM_PL_FLAG_CONTIGUOUS)
  		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
  
  	if (fpfn || lpfn != mgr->mm.size)
@@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
  					   min_block_size,
  					   &vres->blocks,
  					   vres->flags);
-		if (unlikely(r))
+		if (unlikely(r)) {

If pin BO failed to allocate contiguous VRAM, this should return 
failure to application, as RDMA cannot work with scattered VRAM pages.

here we can return -ENOSPC when the BO is pinned (i.e 
TTM_PL_FLAG_CONTIGUOUS is set). Please find the below modified condition,
if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
    !(place->flags & TTM_PL_FLAG_CONTIGUOUS)
Hence if the TTM_PL_FLAG_CONTIGUOUS flag is set, we don't proceed for 
allocating individual pages.

Regards,

Philip

+			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+				/* Fallback to non contiguous allocation */
+				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
+				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+
+				pages_per_block =
+					amdgpu_vram_find_pages_per_block(tbo,
+									 place,
+									 bo_flags);
+				continue;
+			}
  			goto error_free_blocks;
+		}
  
  		if (size > remaining_size)
  			remaining_size = 0;