[AMD Official Use Only - AMD Internal Distribution Only] Hi Tao, We don't plan to apply the change to gfx adapters. And it's only applicable to aldebaran and aqua_vanjaram. I will add back aldebaran in v2. Regards, Hawking -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Tuesday, May 28, 2024 10:41 To: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> Subject: RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity [AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of > Hawking Zhang > Sent: Tuesday, May 28, 2024 10:21 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Kuehling, Felix > <Felix.Kuehling@xxxxxxx>; Kasiviswanathan, Harish > <Harish.Kasiviswanathan@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report > capacity > > Add estimate of how much vram we need to reserve for RAS when > calculating the total available vram. 
> > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 ++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ > 3 files changed, 27 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index e98927529f61..ad813772f8a1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes > + : 0); > size_t system_mem_needed, ttm_mem_needed, vram_needed; > int ret = 0; > uint64_t vram_size = 0; > @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > amdgpu_device *adev, > (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > > kfd_mem_limit.max_ttm_mem_limit) || > (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + > vram_needed > > - vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size) > + > + vram_size - reserved_for_pt - reserved_for_ras - > +atomic64_read(&adev->vram_pin_size) + > atomic64_read(&adev->kfd.vram_pinned))) { > ret = -ENOMEM; > goto release; > @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, { > uint64_t reserved_for_pt = > ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + uint64_t reserved_for_ras = (con ? 
con->reserved_pages_in_bytes > + : 0); > ssize_t available; > uint64_t vram_available, system_mem_available, > ttm_mem_available; > > @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct > amdgpu_device *adev, > - adev->kfd.vram_used_aligned[xcp_id] > - atomic64_read(&adev->vram_pin_size) > + atomic64_read(&adev->kfd.vram_pinned) > - - reserved_for_pt; > + - reserved_for_pt > + - reserved_for_ras; > > if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) { > system_mem_available = no_system_mem_limit ? > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ecce022c657b..a6334e0e62dc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3317,6 +3317,22 @@ static void amdgpu_ras_event_mgr_init(struct > amdgpu_device *adev) > amdgpu_put_xgmi_hive(hive); } > > +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device > +*adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + > + if (!con || (adev->flags & AMD_IS_APU)) > + return; > + > + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { > + case IP_VERSION(13, 0, 6): [Tao] can we apply the change for all ASICs which support RAS? 
> + con->reserved_pages_in_bytes = > AMDGPU_RAS_RESERVED_VRAM_SIZE; > + break; > + default: > + break; > + } > +} > + > int amdgpu_ras_init(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ > -3422,6 > +3438,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > /* Get RAS schema for particular SOC */ > con->schema = amdgpu_get_ras_schema(adev); > > + amdgpu_ras_init_reserved_vram_size(adev); > + > if (amdgpu_ras_fs_init(adev)) { > r = -EINVAL; > goto release_con; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 6a8c7b1609df..bee622c4268a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -64,6 +64,7 @@ struct amdgpu_iv_entry; #define > AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 #define > AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 > > +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) [Tao] it's better to add comment here to explain why the value is 16MB. > /* The high three bits indicates socketid */ #define > AMDGPU_RAS_GET_FEATURES(val) ((val) & > ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) > > @@ -541,6 +542,7 @@ struct amdgpu_ras { > struct ras_event_manager __event_mgr; > struct ras_event_manager *event_mgr; > > + uint64_t reserved_pages_in_bytes; > }; > > struct ras_fs_data { > -- > 2.17.1