[AMD Official Use Only - General] Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > Sent: Monday, March 18, 2024 10:25 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao > <Tao.Zhou1@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: add ras event id support for ACA > > add ras event id support for ACA. > > Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 29 ++++++++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++----- > 3 files changed, 23 insertions(+), 19 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c > index 53ad76f590a1..ddcb68e60a73 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c > @@ -116,20 +116,22 @@ static struct aca_regs_dump { > {"CONTROL_MASK", ACA_REG_IDX_CTL_MASK}, > }; > > -static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, > struct aca_bank *bank) > +static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, > struct aca_bank *bank, > + struct ras_query_context *qctx) > { > + u64 event_id = qctx ? qctx->event_id: 0ULL; > int i; > > - dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events > logged\n"); > + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check > Architecture > +events logged\n"); > /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ > for (i = 0; i < ARRAY_SIZE(aca_regs); i++) > - dev_info(adev->dev, HW_ERR > "ACA[%02d/%02d].%s=0x%016llx\n", > - idx + 1, total, aca_regs[i].name, bank- > >regs[aca_regs[i].reg_idx]); > + RAS_EVENT_LOG(adev, event_id, HW_ERR > "ACA[%02d/%02d].%s=0x%016llx\n", > + idx + 1, total, aca_regs[i].name, > +bank->regs[aca_regs[i].reg_idx]); > } > > static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum > aca_smu_type type, > int start, int count, > - struct aca_banks *banks) > + struct aca_banks *banks, struct > ras_query_context *qctx) > { > struct amdgpu_aca *aca = &adev->aca; > const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -165,7 > +167,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device > *adev, enum aca_smu_ > > bank.type = type; > > - aca_smu_bank_dump(adev, i, count, &bank); > + aca_smu_bank_dump(adev, i, count, &bank, qctx); > > ret = aca_banks_add_bank(banks, &bank); > if (ret) > @@ -390,7 +392,7 @@ static bool aca_bank_should_update(struct > amdgpu_device *adev, enum aca_smu_type } > > static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type > type, > - bank_handler_t handler, void *data) > + bank_handler_t handler, struct ras_query_context > *qctx, void > +*data) > { > struct amdgpu_aca *aca = &adev->aca; > struct aca_banks banks; > @@ -412,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device > *adev, enum aca_smu_type type, > > aca_banks_init(&banks); > > - ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks); > + ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx); > if (ret) > goto err_release_banks; > > @@ -489,7 +491,7 @@ static int aca_log_aca_error(struct aca_handle *handle, > enum aca_error_type type } > > static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle > *handle, enum aca_error_type type, > - struct ras_err_data *err_data) > + struct ras_err_data *err_data, struct > ras_query_context *qctx) > { > enum aca_smu_type smu_type; > int ret; > @@ -507,7 +509,7 @@ static int __aca_get_error_data(struct amdgpu_device > *adev, struct aca_handle *h > } > > /* udpate aca bank to aca source error_cache first */ > - ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, > NULL); > + ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, > +qctx, NULL); > if (ret) > return ret; > > @@ -523,7 +525,7 @@ static bool aca_handle_is_valid(struct aca_handle > *handle) } > > int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle > *handle, > - enum aca_error_type type, void *data) > + enum aca_error_type type, void *data, void *qctx) > { > struct ras_err_data *err_data = (struct ras_err_data *)data; > > @@ -536,7 +538,8 @@ int amdgpu_aca_get_error_data(struct amdgpu_device > *adev, struct aca_handle *han > if (!(BIT(type) & handle->mask)) > return 0; > > - return __aca_get_error_data(adev, handle, type, err_data); > + return __aca_get_error_data(adev, handle, type, err_data, > + (struct ras_query_context *)qctx); > } > > static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) @@ > -853,7 +856,7 @@ static int aca_dump_show(struct seq_file *m, enum > aca_smu_type type) > .idx = 0, > }; > > - return aca_banks_update(adev, type, handler_aca_bank_dump, (void > *)&context); > + return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, > (void > +*)&context); > } > > static int aca_dump_ce_show(struct seq_file *m, void *unused) diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h > index 674a5a9da862..247968d6a925 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h > @@ -198,7 +198,7 @@ int amdgpu_aca_add_handle(struct amdgpu_device > *adev, struct aca_handle *handle, > const char *name, const struct aca_info *aca_info, void > *data); void amdgpu_aca_remove_handle(struct aca_handle *handle); int > amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle > *handle, > - enum aca_error_type type, void *data); > + enum aca_error_type type, void *data, void *qctx); > int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en); > void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry > *root); int aca_error_cache_log_bank_error(struct aca_handle *handle, struct > aca_bank_info *info, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 357349fcfcca..41b9315099a2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1269,7 +1269,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device > *adev, enum amdgpu_ras_block blk) } > > static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum > amdgpu_ras_block blk, > - enum aca_error_type type, struct > ras_err_data *err_data) > + enum aca_error_type type, struct > ras_err_data *err_data, > + struct ras_query_context *qctx) > { > struct ras_manager *obj; > > @@ -1277,7 +1278,7 @@ static int amdgpu_aca_log_ras_error_data(struct > amdgpu_device *adev, enum amdgpu > if (!obj) > return -EINVAL; > > - return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, > err_data); > + return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, > +err_data, qctx); > } > > ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute > *attr, @@ -1334,15 +1335,15 @@ static int > amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, > } > } else { > if (amdgpu_aca_is_enabled(adev)) { > - ret = amdgpu_aca_log_ras_error_data(adev, blk, > ACA_ERROR_TYPE_UE, err_data); > + ret = amdgpu_aca_log_ras_error_data(adev, blk, > ACA_ERROR_TYPE_UE, > +err_data, qctx); > if (ret) > return ret; > > - ret = amdgpu_aca_log_ras_error_data(adev, blk, > ACA_ERROR_TYPE_CE, err_data); > + ret = amdgpu_aca_log_ras_error_data(adev, blk, > ACA_ERROR_TYPE_CE, > +err_data, qctx); > if (ret) > return ret; > > - ret = amdgpu_aca_log_ras_error_data(adev, blk, > ACA_ERROR_TYPE_DEFERRED, err_data); > + ret = amdgpu_aca_log_ras_error_data(adev, blk, > +ACA_ERROR_TYPE_DEFERRED, err_data, qctx); > if (ret) > return ret; > } else { > -- > 2.34.1