[AMD Official Use Only - AMD Internal Distribution Only] Thanks Tao. Yes, I added the comments to amdgpu_ras.h. +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) Regards, Hawking -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Tuesday, May 28, 2024 14:02 To: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx> Subject: RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity v2 [AMD Official Use Only - AMD Internal Distribution Only] I prefer to add comment for AMDGPU_RAS_RESERVED_VRAM_SIZE to explain the value of 16MB, anyway the patch is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Tuesday, May 28, 2024 1:57 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Kuehling, Felix > <Felix.Kuehling@xxxxxxx>; Kasiviswanathan, Harish > <Harish.Kasiviswanathan@xxxxxxx> > Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report > capacity v2 > > Add estimate of how much vram we need to reserve for RAS when > caculating the total available vram. > > v2: apply the change to MP0 v13_0_2 and v13_0_14 > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++ > 3 files changed, 31 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index e98927529f61..ad813772f8a1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes > + : 0); > size_t system_mem_needed, ttm_mem_needed, vram_needed; > int ret = 0; > uint64_t vram_size = 0; > @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, > (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > > kfd_mem_limit.max_ttm_mem_limit) || > (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + > vram_needed > > - vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size) > + > + vram_size - reserved_for_pt - reserved_for_ras - > +atomic64_read(&adev->vram_pin_size) + > atomic64_read(&adev->kfd.vram_pinned))) { > ret = -ENOMEM; > goto release; > @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes > + : 0); > ssize_t available; > uint64_t vram_available, system_mem_available, > ttm_mem_available; > > @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, > - adev->kfd.vram_used_aligned[xcp_id] > - atomic64_read(&adev->vram_pin_size) > + atomic64_read(&adev->kfd.vram_pinned) > - - reserved_for_pt; > + - reserved_for_pt > + - reserved_for_ras; > > if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { > system_mem_available = no_system_mem_limit ? > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ecce022c657b..f28bf5765380 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3317,6 +3317,24 @@ static void amdgpu_ras_event_mgr_init(struct > amdgpu_device *adev) > amdgpu_put_xgmi_hive(hive); } > > +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device > +*adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + > + if (!con || (adev->flags & AMD_IS_APU)) > + return; > + > + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { > + case IP_VERSION(13, 0, 2): > + case IP_VERSION(13, 0, 6): > + case IP_VERSION(13, 0, 14): > + con->reserved_pages_in_bytes = > AMDGPU_RAS_RESERVED_VRAM_SIZE; > + break; > + default: > + break; > + } > +} > + > int amdgpu_ras_init(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ > -3422,6 > +3440,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > /* Get RAS schema for particular SOC */ > con->schema = amdgpu_get_ras_schema(adev); > > + amdgpu_ras_init_reserved_vram_size(adev); > + > if (amdgpu_ras_fs_init(adev)) { > r = -EINVAL; > goto release_con; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 6a8c7b1609df..bd61f77a7134 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -64,6 +64,9 @@ struct amdgpu_iv_entry; #define > AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 #define > AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 > > +/* Reserve 8 physical dram row for possible retirement. > + * In worst cases, it will lose 8 * 2MB memory in vram domain */ > +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) > /* The high three bits indicates socketid */ #define > AMDGPU_RAS_GET_FEATURES(val) ((val) & > ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) > > @@ -541,6 +544,7 @@ struct amdgpu_ras { > struct ras_event_manager __event_mgr; > struct ras_event_manager *event_mgr; > > + uint64_t reserved_pages_in_bytes; > }; > > struct ras_fs_data { > -- > 2.17.1