Re: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2024-04-26 04:37, Lang Yu wrote:
Small APUs(i.e., consumer, embedded products) usually have a small
carveout device memory which can't satisfy most compute workloads
memory allocation requirements.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist.

Though we can change BIOS settings to enlarge carveout size,
which is inflexible and may bring complaint. On the other hand,
the memory resource can't be effectively used between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.

Signed-off-by: Lang Yu <Lang.Yu@xxxxxxx>

Two nit-picks inline. Other than that, this patch looks reasonable to me.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 +++++-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 21 +++++++++++--------
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c      |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  6 ++++--
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h          |  3 ++-
  5 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7ba05f030dd1..3295838e9a1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -456,7 +456,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,
  			mem_info->local_mem_size_private =
  					KFD_XCP_MEMORY_SIZE(adev, xcp->id);
  	} else {
-		mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
+		mem_info->local_mem_size_public = adev->flags & AMD_IS_APU ?
+						  (ttm_tt_pages_limit() << PAGE_SHIFT) :
+						  adev->gmc.visible_vram_size;
  		mem_info->local_mem_size_private = adev->gmc.real_vram_size -
  						adev->gmc.visible_vram_size;

On an APU the private size should be reported as 0.


  	}
@@ -824,6 +826,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id)
  		}
  		do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
  		return ALIGN_DOWN(tmp, PAGE_SIZE);
+	} else if (adev->flags & AMD_IS_APU) {
+		return (ttm_tt_pages_limit() << PAGE_SHIFT);
  	} else {
  		return adev->gmc.real_vram_size;
  	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index c4f9960dafbb..7eb5afcc4895 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
  			return -EINVAL;
vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
-		if (adev->gmc.is_app_apu) {
+		if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
  			system_mem_needed = size;
  			ttm_mem_needed = size;
  		}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
  		  "adev reference can't be null when vram is used");
  	if (adev && xcp_id >= 0) {
  		adev->kfd.vram_used[xcp_id] += vram_needed;
-		adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+		adev->kfd.vram_used_aligned[xcp_id] +=
+				(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
  				vram_needed :
  				ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
  	}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
if (adev) {
  			adev->kfd.vram_used[xcp_id] -= size;
-			if (adev->gmc.is_app_apu) {
+			if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
  				adev->kfd.vram_used_aligned[xcp_id] -= size;
  				kfd_mem_limit.system_mem_used -= size;
  				kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
  	 * if peer device has large BAR. In contrast, access over xGMI is
  	 * allowed for both small and large BAR configurations of peer device
  	 */
-	if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
+	if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) &&
  	    ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
  	     (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
  	     (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
@@ -1657,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
  		- atomic64_read(&adev->vram_pin_size)
  		- reserved_for_pt;
- if (adev->gmc.is_app_apu) {
+	if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
  		system_mem_available = no_system_mem_limit ?
  					kfd_mem_limit.max_system_mem_limit :
  					kfd_mem_limit.max_system_mem_limit -
@@ -1669,6 +1670,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
  		available = min3(system_mem_available, ttm_mem_available,
  				 vram_available);
  		available = ALIGN_DOWN(available, PAGE_SIZE);
+

Unnecessary whitespace change.

Regards,
  Felix


  	} else {
  		available = ALIGN_DOWN(vram_available, VRAM_AVAILABLITY_ALIGN);
  	}
@@ -1705,7 +1707,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
  	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
  		domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
- if (adev->gmc.is_app_apu) {
+		if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
  			domain = AMDGPU_GEM_DOMAIN_GTT;
  			alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
  			alloc_flags = 0;
@@ -1952,7 +1954,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
  	if (size) {
  		if (!is_imported &&
  		   (mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||
-		   (adev->gmc.is_app_apu &&
+		   ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
  		    mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))
  			*size = bo_size;
  		else
@@ -2374,8 +2376,9 @@ static int import_obj_create(struct amdgpu_device *adev,
  	(*mem)->dmabuf = dma_buf;
  	(*mem)->bo = bo;
  	(*mem)->va = va;
-	(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && !adev->gmc.is_app_apu ?
-		AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
+	(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) &&
+			 !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
+			 AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
(*mem)->mapped_to_gpu_memory = 0;
  	(*mem)->process_info = avm->process_info;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 4bcfbeac48fb..4816fcb9803a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
  	if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
  		return -EINVAL;
- if (adev->gmc.is_app_apu)
+	if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)
  		return 0;
pgmap = &kfddev->pgmap;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index bfab16b43fec..238ac11bb97d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2619,7 +2619,8 @@ svm_range_best_restore_location(struct svm_range *prange,
  		return -1;
  	}
- if (node->adev->gmc.is_app_apu)
+	if (node->adev->gmc.is_app_apu ||
+	    node->adev->flags & AMD_IS_APU)
  		return 0;
if (prange->preferred_loc == gpuid ||
@@ -3337,7 +3338,8 @@ svm_range_best_prefetch_location(struct svm_range *prange)
  		goto out;
  	}
- if (bo_node->adev->gmc.is_app_apu) {
+	if (bo_node->adev->gmc.is_app_apu ||
+	    bo_node->adev->flags & AMD_IS_APU) {
  		best_loc = 0;
  		goto out;
  	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 026863a0abcd..9c37bd0567ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -201,7 +201,8 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
   * is initialized to not 0 when page migration register device memory.
   */
  #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\
-					(adev)->gmc.is_app_apu)
+					(adev)->gmc.is_app_apu ||\
+					((adev)->flags & AMD_IS_APU))
void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux