PF will do page retirement, reset VF and inform VF to reserve RAS bad pages. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 18 +++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 + 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index aad3c8b4c810..b603ab3bd138 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -123,12 +123,20 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + if (!amdgpu_sriov_vf(adev)) { + ret = amdgpu_umc_do_page_retirement(adev, + ras_error_status, NULL, reset); - if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + if (ret == AMDGPU_RAS_SUCCESS && obj) { + obj->err_data.ue_count += err_data->ue_count; + obj->err_data.ce_count += err_data->ce_count; + } + } else { + if (adev->virt.ops && adev->virt.ops->ras_poison_handler) + adev->virt.ops->ras_poison_handler(adev); + else + dev_warn(adev->dev, + "No ras_poison_handler interface in SRIOV!\n"); } return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index d94c31e68a14..87de4ef60f86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -84,6 +84,7 @@ struct amdgpu_virt_ops { int (*reset_gpu)(struct amdgpu_device *adev); int (*wait_reset)(struct amdgpu_device *adev); void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, u32 data2, u32 data3); + void (*ras_poison_handler)(struct amdgpu_device *adev); }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index a2f04b249132..fd814a099066 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -405,6 +405,11 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev) return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA); } +void amdgpu_virt_ras_poison_handler(struct amdgpu_device *adev) +{ + xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); +} + const struct amdgpu_virt_ops xgpu_ai_virt_ops = { .req_full_gpu = xgpu_ai_request_full_gpu_access, .rel_full_gpu = xgpu_ai_release_full_gpu_access, @@ -412,4 +417,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = { .wait_reset = NULL, .trans_msg = xgpu_ai_mailbox_trans_msg, .req_init_data = xgpu_ai_request_init_data, + .ras_poison_handler = amdgpu_virt_ras_poison_handler, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h index fa7e13e0459e..188181b0af4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h @@ -39,6 +39,7 @@ enum idh_request { IDH_LOG_VF_ERROR = 200, IDH_READY_TO_RESET = 201, + IDH_RAS_POISON = 202, }; enum idh_event { -- 2.35.1