[Public] Thanks Christian. I've already merged based on Felix's review. I'll send your suggested cleanup for review out soon. Jon > -----Original Message----- > From: Koenig, Christian <Christian.Koenig@xxxxxxx> > Sent: January 12, 2022 2:33 AM > To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx> > Subject: Re: [PATCH] drm/amdgpu: improve debug VRAM access > performance using sdma > > Am 04.01.22 um 20:12 schrieb Jonathan Kim: > > For better performance during VRAM access for debugged processes, do > > read/write copies over SDMA. > > > > In order to fulfill post mortem debugging on a broken device, fallback > > to stable MMIO access when gpu recovery is disabled or when job > > submission time outs are set to max. Failed SDMA access should > > automatically fall back to MMIO access. > > > > Use a pre-allocated GTT bounce buffer pre-mapped into GART to avoid > > page-table updates and TLB flushes on access. > > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 78 > +++++++++++++++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +- > > 2 files changed, 82 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > index 367abed1d6e6..512df4c09772 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > @@ -48,6 +48,7 @@ > > #include <drm/ttm/ttm_range_manager.h> > > > > #include <drm/amdgpu_drm.h> > > +#include <drm/drm_drv.h> > > > > #include "amdgpu.h" > > #include "amdgpu_object.h" > > @@ -1429,6 +1430,70 @@ static void > amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos, > > } > > } > > > > +static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object > *bo, > > + unsigned long offset, void *buf, int > len, int write) { > > + struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo); > > + struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev); > > + struct amdgpu_job *job; > > + struct dma_fence *fence; > > + uint64_t src_addr, dst_addr; > > + unsigned int num_dw; > > + int r, idx; > > + > > + if (len != PAGE_SIZE) > > + return -EINVAL; > > + > > + if (!adev->mman.sdma_access_ptr) > > + return -EACCES; > > + > > + r = drm_dev_enter(adev_to_drm(adev), &idx); > > + if (r) > > + return r; > > + > > + if (write) > > + memcpy(adev->mman.sdma_access_ptr, buf, len); > > + > > + num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8); > > + r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, > AMDGPU_IB_POOL_DELAYED, &job); > > + if (r) > > + goto out; > > + > > + src_addr = write ? amdgpu_bo_gpu_offset(adev- > >mman.sdma_access_bo) : > > + amdgpu_bo_gpu_offset(abo); > > + dst_addr = write ? amdgpu_bo_gpu_offset(abo) : > > + amdgpu_bo_gpu_offset(adev- > >mman.sdma_access_bo); > > I suggest to write this as > > src_addr = a; > dst_addr = b; > if (write) > swap(src_addr, dst_addr); > > This way we are not duplicating getting the different offsets. > > > + amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr, > > +PAGE_SIZE, false); > > + > > + amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job- > >ibs[0]); > > + WARN_ON(job->ibs[0].length_dw > num_dw); > > + > > + r = amdgpu_job_submit(job, &adev->mman.entity, > AMDGPU_FENCE_OWNER_UNDEFINED, &fence); > > + if (r) { > > + amdgpu_job_free(job); > > + goto out; > > + } > > + > > + if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout)) > > + r = -ETIMEDOUT; > > + dma_fence_put(fence); > > + > > + if (!(r || write)) > > + memcpy(buf, adev->mman.sdma_access_ptr, len); > > +out: > > + drm_dev_exit(idx); > > + return r; > > +} > > + > > +static inline bool amdgpu_ttm_allow_post_mortem_debug(struct > > +amdgpu_device *adev) { > > + return amdgpu_gpu_recovery == 0 || > > + adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || > > + adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || > > + adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || > > + adev->video_timeout == MAX_SCHEDULE_TIMEOUT; } > > This should probably be inside amdgpu_device.c > > > + > > /** > > * amdgpu_ttm_access_memory - Read or Write memory that backs a > buffer object. > > * > > @@ -1453,6 +1518,10 @@ static int amdgpu_ttm_access_memory(struct > ttm_buffer_object *bo, > > if (bo->resource->mem_type != TTM_PL_VRAM) > > return -EIO; > > > > + if (!amdgpu_ttm_allow_post_mortem_debug(adev) && > > + !amdgpu_ttm_access_memory_sdma(bo, offset, > buf, len, write)) > > + return len; > > + > > amdgpu_res_first(bo->resource, offset, len, &cursor); > > while (cursor.remaining) { > > size_t count, size = cursor.size; > > @@ -1793,6 +1862,12 @@ int amdgpu_ttm_init(struct amdgpu_device > *adev) > > return r; > > } > > > > + if (amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, > > + AMDGPU_GEM_DOMAIN_GTT, > > + &adev->mman.sdma_access_bo, NULL, > > + adev->mman.sdma_access_ptr)) > > + DRM_WARN("Debug VRAM access will use slowpath MM > access\n"); > > + > > return 0; > > } > > > > @@ -1823,6 +1898,9 @@ void amdgpu_ttm_fini(struct amdgpu_device > *adev) > > ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_OA); > > ttm_device_fini(&adev->mman.bdev); > > adev->mman.initialized = false; > > + if (adev->mman.sdma_access_ptr) > > You can drop that if. Free functions can usually take a NULL pointer. > > Apart from those nit picks looks good to me as well. > > Regards, > Christian. > > > + amdgpu_bo_free_kernel(&adev->mman.sdma_access_bo, > NULL, > > + &adev->mman.sdma_access_ptr); > > DRM_INFO("amdgpu: ttm finalized\n"); > > } > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > > index 91a087f9dc7c..b0116c4a768f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > > @@ -98,6 +98,10 @@ struct amdgpu_mman { > > u64 fw_vram_usage_size; > > struct amdgpu_bo *fw_vram_usage_reserved_bo; > > void *fw_vram_usage_va; > > + > > + /* PAGE_SIZE'd BO for process memory r/w over SDMA. */ > > + struct amdgpu_bo *sdma_access_bo; > > + void *sdma_access_ptr; > > }; > > > > struct amdgpu_copy_mem { > > @@ -193,5 +197,4 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct > amdgpu_device *adev, struct ttm_tt *ttm, > > int amdgpu_ttm_evict_resources(struct amdgpu_device *adev, int > > mem_type); > > > > void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev); > > - > > #endif