Re: [PATCH] drm/amdgpu: improve debug VRAM access performance using sdma

Christian König <christian.koenig@xxxxxxx> · Mon, 20 Mar 2023 18:31:35 +0100

Ah, yes! GART doesn't need to be read to make a GTT allocation.

When GART becomes ready it will be filled with all the buffers which 
were allocated before it was ready.

So this is perfectly fine.

Thanks,
Christian.

Am 20.03.23 um 18:24 schrieb Kim, Jonathan:
[Public]

This was a long time ago but I think we agreed allocation was ok before GART was ready.
IIRC, there was also some mentioned related scenario where APUs needed to work without VRAM but allocations were required (but I don't know the details regarding that).
I vaguely remember the requirement for GART readiness for the bounce buffer allocation caused some problems elsewhere.
Are there problems observed with the bounce buffer being allocated without GART readiness?

Thanks,

Jon
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Monday, March 20, 2023 1:02 PM
To: Quan, Evan <Evan.Quan@xxxxxxx>; Kim, Jonathan
<Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Koenig, Christian
<Christian.Koenig@xxxxxxx>
Subject: Re: [PATCH] drm/amdgpu: improve debug VRAM access performance
using sdma

Caution: This message originated from an External Source. Use proper
caution when opening attachments, clicking links, or responding.


I don't think so. Have we recently re-ordered something here?

Christian.

Am 20.03.23 um 08:05 schrieb Quan, Evan:
[AMD Official Use Only - General]

I happened to find the sdma_access_bo allocation from GTT seems
performing before gart is ready.
That makes the "amdgpu_gart_map" is skipped since adev->gart.ptr is still
NULL.
Is that done intentionally ?

Evan
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
Jonathan Kim
Sent: Wednesday, January 5, 2022 3:12 AM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Kim, Jonathan
<Jonathan.Kim@xxxxxxx>; Koenig, Christian
<Christian.Koenig@xxxxxxx>
Subject: [PATCH] drm/amdgpu: improve debug VRAM access performance
using sdma

For better performance during VRAM access for debugged processes, do
read/write copies over SDMA.

In order to fulfill post mortem debugging on a broken device, fallback to
stable MMIO access when gpu recovery is disabled or when job
submission
time outs are set to max.  Failed SDMA access should automatically fall
back to MMIO access.

Use a pre-allocated GTT bounce buffer pre-mapped into GART to avoid
page-table updates and TLB flushes on access.

Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 78
+++++++++++++++++++++++++
   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  5 +-
   2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 367abed1d6e6..512df4c09772 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -48,6 +48,7 @@
   #include <drm/ttm/ttm_range_manager.h>

   #include <drm/amdgpu_drm.h>
+#include <drm/drm_drv.h>

   #include "amdgpu.h"
   #include "amdgpu_object.h"
@@ -1429,6 +1430,70 @@ static void
amdgpu_ttm_vram_mm_access(struct
amdgpu_device *adev, loff_t pos,
      }
   }

+static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object
*bo,
+                                    unsigned long offset, void *buf, int
len, int write)
+{
+    struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
+    struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
+    struct amdgpu_job *job;
+    struct dma_fence *fence;
+    uint64_t src_addr, dst_addr;
+    unsigned int num_dw;
+    int r, idx;
+
+    if (len != PAGE_SIZE)
+            return -EINVAL;
+
+    if (!adev->mman.sdma_access_ptr)
+            return -EACCES;
+
+    r = drm_dev_enter(adev_to_drm(adev), &idx);
+    if (r)
+            return r;
+
+    if (write)
+            memcpy(adev->mman.sdma_access_ptr, buf, len);
+
+    num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
+    r = amdgpu_job_alloc_with_ib(adev, num_dw * 4,
AMDGPU_IB_POOL_DELAYED, &job);
+    if (r)
+            goto out;
+
+    src_addr = write ? amdgpu_bo_gpu_offset(adev-
mman.sdma_access_bo) :
+                    amdgpu_bo_gpu_offset(abo);
+    dst_addr = write ? amdgpu_bo_gpu_offset(abo) :
+                    amdgpu_bo_gpu_offset(adev-
mman.sdma_access_bo);
+    amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
PAGE_SIZE, false);
+
+    amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);
+    WARN_ON(job->ibs[0].length_dw > num_dw);
+
+    r = amdgpu_job_submit(job, &adev->mman.entity,
AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
+    if (r) {
+            amdgpu_job_free(job);
+            goto out;
+    }
+
+    if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout))
+            r = -ETIMEDOUT;
+    dma_fence_put(fence);
+
+    if (!(r || write))
+            memcpy(buf, adev->mman.sdma_access_ptr, len);
+out:
+    drm_dev_exit(idx);
+    return r;
+}
+
+static inline bool amdgpu_ttm_allow_post_mortem_debug(struct
amdgpu_device *adev)
+{
+    return amdgpu_gpu_recovery == 0 ||
+            adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
+            adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
+            adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+            adev->video_timeout == MAX_SCHEDULE_TIMEOUT;
+}
+
   /**
    * amdgpu_ttm_access_memory - Read or Write memory that backs a
buffer object.
    *
@@ -1453,6 +1518,10 @@ static int amdgpu_ttm_access_memory(struct
ttm_buffer_object *bo,
      if (bo->resource->mem_type != TTM_PL_VRAM)
              return -EIO;

+    if (!amdgpu_ttm_allow_post_mortem_debug(adev) &&
+                    !amdgpu_ttm_access_memory_sdma(bo, offset, buf,
len, write))
+            return len;
+
      amdgpu_res_first(bo->resource, offset, len, &cursor);
      while (cursor.remaining) {
              size_t count, size = cursor.size;
@@ -1793,6 +1862,12 @@ int amdgpu_ttm_init(struct amdgpu_device
*adev)
              return r;
      }

+    if (amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
+                            AMDGPU_GEM_DOMAIN_GTT,
+                            &adev->mman.sdma_access_bo, NULL,
+                            adev->mman.sdma_access_ptr))
+            DRM_WARN("Debug VRAM access will use slowpath MM
access\n");
+
      return 0;
   }

@@ -1823,6 +1898,9 @@ void amdgpu_ttm_fini(struct amdgpu_device
*adev)
      ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_OA);
      ttm_device_fini(&adev->mman.bdev);
      adev->mman.initialized = false;
+    if (adev->mman.sdma_access_ptr)
+            amdgpu_bo_free_kernel(&adev->mman.sdma_access_bo,
NULL,
+                                    &adev->mman.sdma_access_ptr);
      DRM_INFO("amdgpu: ttm finalized\n");
   }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 91a087f9dc7c..b0116c4a768f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -98,6 +98,10 @@ struct amdgpu_mman {
      u64             fw_vram_usage_size;
      struct amdgpu_bo        *fw_vram_usage_reserved_bo;
      void            *fw_vram_usage_va;
+
+    /* PAGE_SIZE'd BO for process memory r/w over SDMA. */
+    struct amdgpu_bo        *sdma_access_bo;
+    void                    *sdma_access_ptr;
   };

   struct amdgpu_copy_mem {
@@ -193,5 +197,4 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct
amdgpu_device *adev, struct ttm_tt *ttm,
   int amdgpu_ttm_evict_resources(struct amdgpu_device *adev, int
mem_type);

   void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev);
-
   #endif
--
2.25.1