[AMD Official Use Only - General] I assume we are leveraging error_query_mode to differentiate aca path from legacy ras path, right? But given in-band error reporting is just the start of transition from legacy ras to aca, do we need a flag in amdgpu_aca to indicate whether aca is supported or not? Accordingly, we can initialize the flag in amdgpu_ras_check_supported. it should help us to differentiate aca from legacy ras when implementing other features, thoughts? Regards, Hawking -----Original Message----- From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> Sent: Wednesday, January 3, 2024 16:02 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> Subject: [PATCH 05/14] drm/amdgpu: add amdgpu ras aca query interface use new ACA error query interface to instead of legacy MCA query. Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 88 ++++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++- 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 038bd1b17cef..bbae41f86e00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1168,6 +1168,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s } } +static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, +enum amdgpu_ras_block blk) { + struct ras_common_if head; + + memset(&head, 0, sizeof(head)); + head.block = blk; + + return amdgpu_ras_find_obj(adev, &head); } + +int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + const struct aca_info *aca_info, void *data) { + struct ras_manager *obj; + + obj = get_ras_manager(adev, blk); + if (!obj) + return -EINVAL; + + return amdgpu_aca_add_handle(adev, &obj->aca_handle, +ras_block_str(blk), aca_info, data); } + +int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum +amdgpu_ras_block blk) { + struct ras_manager *obj; + + obj = get_ras_manager(adev, blk); + if (!obj) + return -EINVAL; + + amdgpu_aca_remove_handle(&obj->aca_handle); + + return 0; +} + +static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum aca_error_type type, struct ras_err_data *err_data) { + struct ras_manager *obj; + + obj = get_ras_manager(adev, blk); + if (!obj) + return -EINVAL; + + return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, +err_data); } + static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, struct ras_query_if *info, struct ras_err_data *err_data, @@ -1175,6 +1222,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, { enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; struct amdgpu_ras_block_object *block_obj = NULL; + int ret; if (blk == AMDGPU_RAS_BLOCK_COUNT) return -EINVAL; @@ -1204,9 +1252,13 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } } } else { - /* FIXME: add code to check return value later */ - amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data); - amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data); + ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data); + if (ret) + return ret; + + ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data); + if (ret) + return ret; } return 0; @@ -1254,7 +1306,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; struct amdgpu_hive_info *hive; int hive_ras_recovery = 0; @@ -1265,7 +1317,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, } if (!amdgpu_ras_is_supported(adev, block) || - !amdgpu_ras_get_mca_debug_mode(adev)) + !amdgpu_ras_get_aca_debug_mode(adev)) return -EOPNOTSUPP; hive = amdgpu_get_xgmi_hive(adev); @@ -1277,7 +1329,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, /* skip ras error reset in gpu reset */ if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || hive_ras_recovery) && - mca_funcs && mca_funcs->mca_set_debug_mode) + smu_funcs && smu_funcs->set_debug_mode) return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) @@ -1773,7 +1825,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) } } - amdgpu_mca_smu_debugfs_init(adev, dir); + amdgpu_aca_smu_debugfs_init(adev, dir); } /* debugfs end */ @@ -3138,8 +3190,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; - /* enable MCA debug on APU device */ - amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU)); + /* enable ACA debug on APU device */ + amdgpu_ras_set_aca_debug_mode(adev, !!(adev->flags & AMD_IS_APU)); list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { if (!node->ras_obj) { @@ -3422,7 +3474,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) if (con) { ret = amdgpu_mca_smu_set_debug_mode(adev, enable); if (!ret) - con->is_mca_debug_mode = enable; + con->is_aca_debug_mode = enable; } return ret; @@ -3436,22 +3488,22 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) if (con) { ret = amdgpu_aca_smu_set_debug_mode(adev, enable); if (!ret) - con->is_mca_debug_mode = enable; + con->is_aca_debug_mode = enable; } return ret; } -bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev) +bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; if (!con) return false; - if (mca_funcs && mca_funcs->mca_set_debug_mode) - return con->is_mca_debug_mode; + if (smu_funcs && smu_funcs->set_debug_mode) + return con->is_aca_debug_mode; else return true; } @@ -3460,16 +3512,16 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, unsigned int *error_query_mode) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; if (!con) { *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; return false; } - if (mca_funcs && mca_funcs->mca_set_debug_mode) + if (smu_funcs && smu_funcs->set_debug_mode) *error_query_mode = - (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; + (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : +AMDGPU_RAS_FIRMWARE_ERROR_QUERY; else *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 408e21c3cc88..2afac9aa381a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -442,7 +442,7 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; /* Record status of smu mca debug mode */ - bool is_mca_debug_mode; + bool is_aca_debug_mode; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; @@ -530,6 +530,8 @@ struct ras_manager { struct ras_ih_data ih_data; struct ras_err_data err_data; + + struct aca_handle aca_handle; }; struct ras_badpage { @@ -781,9 +783,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); -int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable); int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); -bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); +int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool +enable); bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device +*adev); bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, unsigned int *mode); @@ -821,4 +823,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, struct amdgpu_smuio_mcm_config_info *mcm_info, struct ras_err_addr *err_addr, u64 count); +int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + const struct aca_info *aca_info, void *data); int +amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block +blk); + #endif -- 2.34.1