Am 12.03.24 um 18:50 schrieb Victor Skvortsov:
Use amdgpu_vram_mgr to reserve bad page ranges.
Reserved ranges will be freed by amdgpu_vram_mgr_fini().
Delete the bo_create path as it is redundant.
Suggested-by: Christian König <christian.koenig@xxxxxxx>
Signed-off-by: Victor Skvortsov <victor.skvortsov@xxxxxxx>
Acked-by: Christian König <christian.koenig@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 55 ++----------------------
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 -
2 files changed, 3 insertions(+), 54 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 7a4eae36778a..2a20714b9c16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -244,7 +244,6 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
*/
unsigned int align_space = 512;
void *bps = NULL;
- struct amdgpu_bo **bps_bo = NULL;
*data = kmalloc(sizeof(struct amdgpu_virt_ras_err_handler_data), GFP_KERNEL);
if (!*data)
@@ -254,12 +253,7 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
if (!bps)
goto bps_failure;
- bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
- if (!bps_bo)
- goto bps_bo_failure;
-
(*data)->bps = bps;
- (*data)->bps_bo = bps_bo;
(*data)->count = 0;
(*data)->last_reserved = 0;
@@ -267,34 +261,12 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
return 0;
-bps_bo_failure:
- kfree(bps);
bps_failure:
kfree(*data);
data_failure:
return -ENOMEM;
}
-static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev)
-{
- struct amdgpu_virt *virt = &adev->virt;
- struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
- struct amdgpu_bo *bo;
- int i;
-
- if (!data)
- return;
-
- for (i = data->last_reserved - 1; i >= 0; i--) {
- bo = data->bps_bo[i];
- if (bo) {
- amdgpu_bo_free_kernel(&bo, NULL, NULL);
- data->bps_bo[i] = bo;
- }
- data->last_reserved = i;
- }
-}
-
void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev)
{
struct amdgpu_virt *virt = &adev->virt;
@@ -305,10 +277,7 @@ void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev)
if (!data)
return;
- amdgpu_virt_ras_release_bp(adev);
-
kfree(data->bps);
- kfree(data->bps_bo);
kfree(data);
virt->virt_eh_data = NULL;
}
@@ -330,9 +299,6 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
{
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
- struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
- struct ttm_resource_manager *man = &mgr->manager;
- struct amdgpu_bo *bo = NULL;
uint64_t bp;
int i;
@@ -341,26 +307,11 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
for (i = data->last_reserved; i < data->count; i++) {
bp = data->bps[i].retired_page;
+ if (amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
+ bp << AMDGPU_GPU_PAGE_SHIFT, AMDGPU_GPU_PAGE_SIZE))
+ DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
- /* There are two cases of reserve error should be ignored:
- * 1) a ras bad page has been allocated (used by someone);
- * 2) a ras bad page has been reserved (duplicate error injection
- * for one page);
- */
- if (ttm_resource_manager_used(man)) {
- amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
- bp << AMDGPU_GPU_PAGE_SHIFT,
- AMDGPU_GPU_PAGE_SIZE);
- data->bps_bo[i] = NULL;
- } else {
- if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
- AMDGPU_GPU_PAGE_SIZE,
- &bo, NULL))
- DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
- data->bps_bo[i] = bo;
- }
data->last_reserved = i + 1;
- bo = NULL;
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 3f59b7b5523f..15599951e7b8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -224,8 +224,6 @@ struct amdgim_vf2pf_info_v2 {
struct amdgpu_virt_ras_err_handler_data {
/* point to bad page records array */
struct eeprom_table_record *bps;
- /* point to reserved bo array */
- struct amdgpu_bo **bps_bo;
/* the count of entries */
int count;
/* last reserved entry's index + 1 */