Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2024-04-16 02:50, Paneer Selvam, Arunpravin wrote:


On 4/16/2024 3:32 AM, Philip Yang wrote:


On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
Now we have two flags for contiguous VRAM buffer allocation.
If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.
This change will simplify the KFD best effort contiguous VRAM allocation, because KFD doesn't need set new GEM_ flag.
When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.

AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs and F/W size related BOs, do all allocation happen at driver initialization before the VRAM is fragmented?

- we will try to allocate the contiguous buffer. Say if the
   allocation fails, we fallback to allocate the individual pages.

When we setTTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
   and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
   the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer Selvam<Arunpravin.PaneerSelvam@xxxxxxx>
Suggested-by: Christian König<christian.koenig@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
  2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
          else
              places[c].flags |= TTM_PL_FLAG_TOPDOWN;
  -        if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-            places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
          c++;
      }
  @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
  {
      struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
      struct ttm_operation_ctx ctx = { false, false };
+    struct ttm_place *places = bo->placements;
+    u32 c = 0;
      int r, i;
        if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
        if (bo->tbo.pin_count) {
          uint32_t mem_type = bo->tbo.resource->mem_type;
-        uint32_t mem_flags = bo->tbo.resource->placement;
            if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
              return -EINVAL;
  -        if ((mem_type == TTM_PL_VRAM) &&
-            (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-            !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-            return -EINVAL;
-
This looks like a bug before, but with this patch, the check makes sense and is needed.
          ttm_bo_pin(&bo->tbo);
            if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
              bo->placements[i].lpfn = lpfn;
      }
  +    if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+        !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+        places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+

If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, should pin and return scattered pages because the RDMA support scattered dmabuf. Christian also pointed this out.

If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&

    bo->placements[i].mem_type == TTM_PL_VRAM)
        o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
      r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
      if (unlikely(r)) {
          dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
      return size;
  }
  +static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+                 const struct ttm_place *place,
+                 unsigned long bo_flags)
+{
+    unsigned long pages_per_block;
+
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+        place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+        pages_per_block = ~0ul;
+    } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        pages_per_block = HPAGE_PMD_NR;
+#else
+        /* default to 2MB */
+        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+        pages_per_block = max_t(uint32_t, pages_per_block,
+                    tbo->page_alignment);
+    }
+
+    return pages_per_block;
+}
+
  /**
   * DOC: mem_info_vram_total
   *
@@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
      struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
      struct amdgpu_device *adev = to_amdgpu_device(mgr);
      u64 vis_usage = 0, max_bytes, min_block_size;
+    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
      struct amdgpu_vram_mgr_resource *vres;
      u64 size, remaining_size, lpfn, fpfn;
+    unsigned long bo_flags = bo->flags;
      struct drm_buddy *mm = &mgr->mm;
      struct drm_buddy_block *block;
      unsigned long pages_per_block;
@@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
      if (tbo->type != ttm_bo_type_kernel)
          max_bytes -= AMDGPU_VM_RESERVED_VRAM;
  -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-        pages_per_block = ~0ul;
-    } else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-        pages_per_block = HPAGE_PMD_NR;
-#else
-        /* default to 2MB */
-        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
-#endif
-        pages_per_block = max_t(uint32_t, pages_per_block,
-                    tbo->page_alignment);
-    }
+    pages_per_block =
+        amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
        vres = kzalloc(sizeof(*vres), GFP_KERNEL);
      if (!vres)
@@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
      if (place->flags & TTM_PL_FLAG_TOPDOWN)
          vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
  -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+        place->flags & TTM_PL_FLAG_CONTIGUOUS)
          vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
        if (fpfn || lpfn != mgr->mm.size)
@@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
                         min_block_size,
                         &vres->blocks,
                         vres->flags);
-        if (unlikely(r))
+        if (unlikely(r)) {

If pin BO failed to allocate contiguous VRAM, this should return failure to application, as RDMA cannot work with scattered VRAM pages.

here we can return -ENOSPC when the BO is pinned (i.e TTM_PL_FLAG_CONTIGUOUS is set). Please find the below modified condition,
if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
    !(place->flags & TTM_PL_FLAG_CONTIGUOUS)

Yes, this change looks good for me.

Thanks

Philip

Hence if the TTM_PL_FLAG_CONTIGUOUS flag is set, we don't proceed for allocating individual pages.

Regards,

Philip

+            if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+                /* Fallback to non contiguous allocation */
+                vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
+                bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+
+                pages_per_block =
+                    amdgpu_vram_find_pages_per_block(tbo,
+                                     place,
+                                     bo_flags);
+                continue;
+            }
              goto error_free_blocks;
+        }
            if (size > remaining_size)
              remaining_size = 0;


[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux