RE: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[Public]

>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling@xxxxxxx>
>Sent: Saturday, April 27, 2024 6:52 AM
>To: Yu, Lang <Lang.Yu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
>Cc: Yang, Philip <Philip.Yang@xxxxxxx>; Koenig, Christian
><Christian.Koenig@xxxxxxx>; Zhang, Yifan <Yifan1.Zhang@xxxxxxx>; Liu,
>Aaron <Aaron.Liu@xxxxxxx>
>Subject: Re: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT
>domain on small APUs
>
>
>On 2024-04-26 04:37, Lang Yu wrote:
>> Small APUs(i.e., consumer, embedded products) usually have a small
>> carveout device memory which can't satisfy most compute workloads
>> memory allocation requirements.
>>
>> We can't even run a Basic MNIST Example with a default 512MB carveout.
>> https://github.com/pytorch/examples/tree/main/mnist.
>>
>> Though we can change BIOS settings to enlarge carveout size, which is
>> inflexible and may bring complaint. On the other hand, the memory
>> resource can't be effectively used between host and device.
>>
>> The solution is MI300A approach, i.e., let VRAM allocations go to GTT.
>>
>> Signed-off-by: Lang Yu <Lang.Yu@xxxxxxx>
>
>Two nit-picks inline. Other than that, this patch looks reasonable to me.

Thanks. Will update them accordingly.

Regards,
Lang

>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 +++++-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 21 +++++++++++-
>-------
>>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c      |  2 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  6 ++++--
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h          |  3 ++-
>>   5 files changed, 24 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 7ba05f030dd1..3295838e9a1d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -456,7 +456,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct
>amdgpu_device *adev,
>>                      mem_info->local_mem_size_private =
>>                                      KFD_XCP_MEMORY_SIZE(adev, xcp-
>>id);
>>      } else {
>> -            mem_info->local_mem_size_public = adev-
>>gmc.visible_vram_size;
>> +            mem_info->local_mem_size_public = adev->flags &
>AMD_IS_APU ?
>> +                                              (ttm_tt_pages_limit() <<
>PAGE_SHIFT) :
>> +                                              adev-
>>gmc.visible_vram_size;
>>              mem_info->local_mem_size_private = adev-
>>gmc.real_vram_size -
>>                                              adev->gmc.visible_vram_size;
>
>On an APU the private size should be reported as 0.
>
>
>>      }
>> @@ -824,6 +826,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct
>amdgpu_device *adev, int xcp_id)
>>              }
>>              do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
>>              return ALIGN_DOWN(tmp, PAGE_SIZE);
>> +    } else if (adev->flags & AMD_IS_APU) {
>> +            return (ttm_tt_pages_limit() << PAGE_SHIFT);
>>      } else {
>>              return adev->gmc.real_vram_size;
>>      }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index c4f9960dafbb..7eb5afcc4895 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>>                      return -EINVAL;
>>
>>              vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
>> -            if (adev->gmc.is_app_apu) {
>> +            if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>>                      system_mem_needed = size;
>>                      ttm_mem_needed = size;
>>              }
>> @@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
>amdgpu_device *adev,
>>                "adev reference can't be null when vram is used");
>>      if (adev && xcp_id >= 0) {
>>              adev->kfd.vram_used[xcp_id] += vram_needed;
>> -            adev->kfd.vram_used_aligned[xcp_id] += adev-
>>gmc.is_app_apu ?
>> +            adev->kfd.vram_used_aligned[xcp_id] +=
>> +                            (adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) ?
>>                              vram_needed :
>>                              ALIGN(vram_needed,
>VRAM_AVAILABLITY_ALIGN);
>>      }
>> @@ -260,7 +261,7 @@ void
>amdgpu_amdkfd_unreserve_mem_limit(struct
>> amdgpu_device *adev,
>>
>>              if (adev) {
>>                      adev->kfd.vram_used[xcp_id] -= size;
>> -                    if (adev->gmc.is_app_apu) {
>> +                    if (adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) {
>>                              adev->kfd.vram_used_aligned[xcp_id] -= size;
>>                              kfd_mem_limit.system_mem_used -= size;
>>                              kfd_mem_limit.ttm_mem_used -= size; @@ -
>889,7 +890,7 @@ static
>> int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
>>       * if peer device has large BAR. In contrast, access over xGMI is
>>       * allowed for both small and large BAR configurations of peer device
>>       */
>> -    if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
>> +    if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags &
>> +AMD_IS_APU)) &&
>>          ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
>>           (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
>>           (mem->alloc_flags &
>KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { @@
>> -1657,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct
>amdgpu_device *adev,
>>              - atomic64_read(&adev->vram_pin_size)
>>              - reserved_for_pt;
>>
>> -    if (adev->gmc.is_app_apu) {
>> +    if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>>              system_mem_available = no_system_mem_limit ?
>>
>       kfd_mem_limit.max_system_mem_limit :
>>
>       kfd_mem_limit.max_system_mem_limit - @@ -1669,6 +1670,7 @@
>> size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device
>*adev,
>>              available = min3(system_mem_available, ttm_mem_available,
>>                               vram_available);
>>              available = ALIGN_DOWN(available, PAGE_SIZE);
>> +
>
>Unnecessary whitespace change.
>
>Regards,
>   Felix
>
>
>>      } else {
>>              available = ALIGN_DOWN(vram_available,
>VRAM_AVAILABLITY_ALIGN);
>>      }
>> @@ -1705,7 +1707,7 @@ int
>amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>      if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
>>              domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
>>
>> -            if (adev->gmc.is_app_apu) {
>> +            if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>>                      domain = AMDGPU_GEM_DOMAIN_GTT;
>>                      alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
>>                      alloc_flags = 0;
>> @@ -1952,7 +1954,7 @@ int
>amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>      if (size) {
>>              if (!is_imported &&
>>                 (mem->bo->preferred_domains ==
>AMDGPU_GEM_DOMAIN_VRAM ||
>> -               (adev->gmc.is_app_apu &&
>> +               ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
>>                  mem->bo->preferred_domains ==
>AMDGPU_GEM_DOMAIN_GTT)))
>>                      *size = bo_size;
>>              else
>> @@ -2374,8 +2376,9 @@ static int import_obj_create(struct
>amdgpu_device *adev,
>>      (*mem)->dmabuf = dma_buf;
>>      (*mem)->bo = bo;
>>      (*mem)->va = va;
>> -    (*mem)->domain = (bo->preferred_domains &
>AMDGPU_GEM_DOMAIN_VRAM) && !adev->gmc.is_app_apu ?
>> -            AMDGPU_GEM_DOMAIN_VRAM :
>AMDGPU_GEM_DOMAIN_GTT;
>> +    (*mem)->domain = (bo->preferred_domains &
>AMDGPU_GEM_DOMAIN_VRAM) &&
>> +                     !(adev->gmc.is_app_apu || adev->flags &
>AMD_IS_APU) ?
>> +                     AMDGPU_GEM_DOMAIN_VRAM :
>AMDGPU_GEM_DOMAIN_GTT;
>>
>>      (*mem)->mapped_to_gpu_memory = 0;
>>      (*mem)->process_info = avm->process_info; diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> index 4bcfbeac48fb..4816fcb9803a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> @@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct
>amdgpu_device *adev)
>>      if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
>>              return -EINVAL;
>>
>> -    if (adev->gmc.is_app_apu)
>> +    if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)
>>              return 0;
>>
>>      pgmap = &kfddev->pgmap;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index bfab16b43fec..238ac11bb97d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -2619,7 +2619,8 @@ svm_range_best_restore_location(struct
>svm_range *prange,
>>              return -1;
>>      }
>>
>> -    if (node->adev->gmc.is_app_apu)
>> +    if (node->adev->gmc.is_app_apu ||
>> +        node->adev->flags & AMD_IS_APU)
>>              return 0;
>>
>>      if (prange->preferred_loc == gpuid || @@ -3337,7 +3338,8 @@
>> svm_range_best_prefetch_location(struct svm_range *prange)
>>              goto out;
>>      }
>>
>> -    if (bo_node->adev->gmc.is_app_apu) {
>> +    if (bo_node->adev->gmc.is_app_apu ||
>> +        bo_node->adev->flags & AMD_IS_APU) {
>>              best_loc = 0;
>>              goto out;
>>      }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> index 026863a0abcd..9c37bd0567ef 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> @@ -201,7 +201,8 @@ void svm_range_list_lock_and_flush_work(struct
>svm_range_list *svms, struct mm_s
>>    * is initialized to not 0 when page migration register device memory.
>>    */
>>   #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0
>||\
>> -                                    (adev)->gmc.is_app_apu)
>> +                                    (adev)->gmc.is_app_apu ||\
>> +                                    ((adev)->flags & AMD_IS_APU))
>>
>>   void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
>>




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux