add amdgpu smu mca dump feature support. Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 68 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 57 +++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 8d9ff9e151de..9fa88ae81b12 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -142,3 +142,71 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) return 0; } + +void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs) +{ + struct amdgpu_mca *mca = &adev->mca; + + mca->mca_funcs = mca_funcs; +} + +int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (mca_funcs && mca_funcs->mca_set_debug_mode) + return mca_funcs->mca_set_debug_mode(adev, enable); + + return -ENOTSUPP; +} + +int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (!count) + return -EINVAL; + + if (mca_funcs && mca_funcs->mca_get_valid_mca_count) + return mca_funcs->mca_get_valid_mca_count(adev, type, count); + + return -ENOTSUPP; +} + +int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, uint32_t *count) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + if (!count) + return -EINVAL; + + if (mca_funcs && mca_funcs->mca_get_error_count) + return mca_funcs->mca_get_error_count(adev, blk, type, count); + + return -ENOTSUPP; +} + +int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev,enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + int count; + + switch (type) { + case AMDGPU_MCA_ERROR_TYPE_UE: + count = mca_funcs->max_ue_count; + break; + case AMDGPU_MCA_ERROR_TYPE_CE: + count = mca_funcs->max_ce_count; + break; + default: + return -EINVAL; + } + + if (idx >= count) + return -EINVAL; + + if (mca_funcs && mca_funcs->mca_get_mca_entry) + return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); + + return -ENOTSUPP; +} + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h index 997a073e2409..6915ae0d5b92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h @@ -21,6 +21,26 @@ #ifndef __AMDGPU_MCA_H__ #define __AMDGPU_MCA_H__ +#include "amdgpu_ras.h" + +#define MCA_MAX_REGS_COUNT (16) + +enum amdgpu_mca_ip { + AMDGPU_MCA_IP_UNKNOW = -1, + AMDGPU_MCA_IP_PSP = 0, + AMDGPU_MCA_IP_SDMA, + AMDGPU_MCA_IP_GC, + AMDGPU_MCA_IP_SMU, + AMDGPU_MCA_IP_MP5, + AMDGPU_MCA_IP_UMC, + AMDGPU_MCA_IP_COUNT, +}; + +enum amdgpu_mca_error_type { + AMDGPU_MCA_ERROR_TYPE_UE = 0, + AMDGPU_MCA_ERROR_TYPE_CE, +}; + struct amdgpu_mca_ras_block { struct amdgpu_ras_block_object ras_block; }; @@ -34,6 +54,36 @@ struct amdgpu_mca { struct amdgpu_mca_ras mp0; struct amdgpu_mca_ras mp1; struct amdgpu_mca_ras mpio; + const struct amdgpu_mca_smu_funcs *mca_funcs; +}; + +struct mca_bank_info { + int socket_id; + int aid; + int hwid; + int mcatype; +}; + +struct mca_bank_entry { + int idx; + enum amdgpu_mca_error_type type; + enum amdgpu_mca_ip ip; + struct mca_bank_info info; + uint64_t regs[MCA_MAX_REGS_COUNT]; +}; + +struct amdgpu_mca_smu_funcs { + int max_ue_count; + int max_ce_count; + int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable); + int (*mca_get_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, uint32_t *count); + int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, + uint32_t *count); + int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, + int idx, struct mca_bank_entry *entry); + int (*mca_get_ras_mca_idx_array)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size); }; void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, @@ -53,4 +103,11 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev, int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev); int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev); int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev); + +void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs); +int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable); +int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count); +int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, uint32_t *count); +int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev,enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry); + #endif -- 2.34.1