RDMA device with limited scatter-gather capability requires physical address contiguous VRAM buffer for RDMA peer direct access. Add a new KFD alloc memory flag and store as new GEM bo alloc flag. When pin this buffer object to export for RDMA peerdirect access, set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag, and then vram_mgr will set TTM_PL_FLAG_CONTIFUOUS flag to ask VRAM buddy allocator to get contiguous VRAM. Remove the 2GB max memory block size limit for contiguous allocation. Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++++++-- include/uapi/drm/amdgpu_drm.h | 5 +++++ include/uapi/linux/kfd_ioctl.h | 1 + 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..3523b91f8add 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,6 +1470,9 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT) + bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); @@ -1712,6 +1715,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..1d6e45e238e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -516,8 +516,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations + * for no contiguous allocation. + */ + size = min(remaining_size, 2ULL << 30); if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index ad21c613fec8..13645abb8e46 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -171,6 +171,11 @@ extern "C" { * may override the MTYPE selected in AMDGPU_VA_OP_MAP. */ #define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) +/* Flag that allocating the BO with best effort for contiguous VRAM. + * If no contiguous VRAM, fallback to scattered allocation. + * Pin the BO for peerdirect RDMA trigger VRAM defragmentation. + */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT (1 << 16) struct drm_amdgpu_gem_create_in { /** the requested memory size */ diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2