The default ttm_tt_pages_limit is 1/2 of system memory, which makes allocations prone to out-of-memory failures with such a configuration. Allow memory oversubscription on APUs: warn instead of returning -ENOMEM when a limit is exceeded. Signed-off-by: Lang Yu <Lang.Yu@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 3295838e9a1d..c01c6f3ab562 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) int i; int last_valid_bit; - amdgpu_amdkfd_gpuvm_init_mem_limits(); + amdgpu_amdkfd_gpuvm_init_mem_limits(adev); if (adev->kfd.dev) { struct kgd2kfd_shared_resources gpu_resources = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 1de021ebdd46..13284dbd8c58 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id); #if IS_ENABLED(CONFIG_HSA_AMD) -void amdgpu_amdkfd_gpuvm_init_mem_limits(void); +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, struct amdgpu_vm *vm); @@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo); void amdgpu_amdkfd_reserve_system_mem(uint64_t size); #else static inline -void amdgpu_amdkfd_gpuvm_init_mem_limits(void) +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev) { } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 7eb5afcc4895..a3e623a320b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -60,6 
+60,7 @@ static struct { int64_t system_mem_used; int64_t ttm_mem_used; spinlock_t mem_limit_lock; + bool alow_oversubscribe; } kfd_mem_limit; static const char * const domain_bit_to_string[] = { @@ -110,7 +111,7 @@ static bool reuse_dmamap(struct amdgpu_device *adev, struct amdgpu_device *bo_ad * System (TTM + userptr) memory - 15/16th System RAM * TTM memory - 3/8th System RAM */ -void amdgpu_amdkfd_gpuvm_init_mem_limits(void) +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev) { struct sysinfo si; uint64_t mem; @@ -130,6 +131,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void) kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT; kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT; + kfd_mem_limit.alow_oversubscribe = !!(adev->flags & AMD_IS_APU); pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n", (kfd_mem_limit.max_system_mem_limit >> 20), (kfd_mem_limit.max_ttm_mem_limit >> 20)); @@ -221,8 +223,12 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, kfd_mem_limit.max_ttm_mem_limit) || (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) { - ret = -ENOMEM; - goto release; + if (kfd_mem_limit.alow_oversubscribe) { + pr_warn_ratelimited("Memory is getting oversubscried.\n"); + } else { + ret = -ENOMEM; + goto release; + } } /* Update memory accounting by decreasing available system -- 2.25.1