On 2021-05-25 6:03 p.m., Alex Deucher wrote: > On Fri, May 21, 2021 at 5:19 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote: >> On Context Query2 IOCTL return the correctable and >> uncorrectable errors in O(1) fashion, from cached >> values, and schedule a delayed work function to >> calculate and cache them for the next such IOCTL. > Patches 1, 2, are: > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > > For patch 3, I think we need to cancel any outstanding delayed work in > ras_fini(). Other than that, it looks good to me. Ah, yes, good point--I missed that. I'll add it and resubmit. Regards, Luben > > Alex > >> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> >> Cc: Christian König <christian.koenig@xxxxxxx> >> Cc: John Clements <john.clements@xxxxxxx> >> Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> >> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 32 +++++++++++++++++++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +++++++++++++++++++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ >> 3 files changed, 73 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> index bb0cfe871aba..4e95d255960b 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> @@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, >> return 0; >> } >> >> +#define AMDGPU_RAS_COUNTE_DELAY_MS 3000 >> + >> static int amdgpu_ctx_query2(struct amdgpu_device *adev, >> - struct amdgpu_fpriv *fpriv, uint32_t id, >> - union drm_amdgpu_ctx_out *out) >> + struct amdgpu_fpriv *fpriv, uint32_t id, >> + union drm_amdgpu_ctx_out *out) >> { >> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> struct amdgpu_ctx *ctx; >> struct amdgpu_ctx_mgr *mgr; >> >> @@ -361,6 +364,31 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, >> if (atomic_read(&ctx->guilty)) >> 
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; >> >> + if (adev->ras_enabled && con) { >> + /* Return the cached values in O(1), >> + * and schedule delayed work to cache >> + * new values. >> + */ >> + int ce_count, ue_count; >> + >> + ce_count = atomic_read(&con->ras_ce_count); >> + ue_count = atomic_read(&con->ras_ue_count); >> + >> + if (ce_count != ctx->ras_counter_ce) { >> + ctx->ras_counter_ce = ce_count; >> + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; >> + } >> + >> + if (ue_count != ctx->ras_counter_ue) { >> + ctx->ras_counter_ue = ue_count; >> + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; >> + } >> + >> + if (!delayed_work_pending(&con->ras_counte_delay_work)) >> + schedule_delayed_work(&con->ras_counte_delay_work, >> + msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS)); >> + } >> + >> mutex_unlock(&mgr->lock); >> return 0; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> index ed3c43e8b0b5..80f576098318 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> @@ -27,6 +27,7 @@ >> #include <linux/uaccess.h> >> #include <linux/reboot.h> >> #include <linux/syscalls.h> >> +#include <linux/pm_runtime.h> >> >> #include "amdgpu.h" >> #include "amdgpu_ras.h" >> @@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) >> adev->ras_hw_enabled & amdgpu_ras_mask; >> } >> >> +static void amdgpu_ras_counte_dw(struct work_struct *work) >> +{ >> + struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, >> + ras_counte_delay_work.work); >> + struct amdgpu_device *adev = con->adev; >> + struct drm_device *dev = &adev->ddev; >> + unsigned long ce_count, ue_count; >> + int res; >> + >> + res = pm_runtime_get_sync(dev->dev); >> + if (res < 0) >> + goto Out; >> + >> + /* Cache new values. 
>> + */ >> + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); >> + atomic_set(&con->ras_ce_count, ce_count); >> + atomic_set(&con->ras_ue_count, ue_count); >> + >> + pm_runtime_mark_last_busy(dev->dev); >> +Out: >> + pm_runtime_put_autosuspend(dev->dev); >> +} >> + >> int amdgpu_ras_init(struct amdgpu_device *adev) >> { >> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> @@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) >> if (!con) >> return -ENOMEM; >> >> + con->adev = adev; >> + INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); >> + atomic_set(&con->ras_ce_count, 0); >> + atomic_set(&con->ras_ue_count, 0); >> + >> con->objs = (struct ras_manager *)(con + 1); >> >> amdgpu_ras_set_context(adev, con); >> @@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, >> struct ras_fs_if *fs_info, >> struct ras_ih_if *ih_info) >> { >> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> + unsigned long ue_count, ce_count; >> int r; >> >> /* disable RAS feature per IP block if it is not supported */ >> @@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, >> if (r) >> goto sysfs; >> >> + /* Those are the cached values at init. 
>> + */ >> + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); >> + atomic_set(&con->ras_ce_count, ce_count); >> + atomic_set(&con->ras_ue_count, ue_count); >> + >> return 0; >> cleanup: >> amdgpu_ras_sysfs_remove(adev, ras_block); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> index 10fca0393106..256cea5d34f2 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> @@ -340,6 +340,11 @@ struct amdgpu_ras { >> >> /* disable ras error count harvest in recovery */ >> bool disable_ras_err_cnt_harvest; >> + >> + /* RAS count errors delayed work */ >> + struct delayed_work ras_counte_delay_work; >> + atomic_t ras_ue_count; >> + atomic_t ras_ce_count; >> }; >> >> struct ras_fs_data { >> -- >> 2.31.1.527.g2d677e5b15 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Cluben.tuikov%40amd.com%7C16860b04040649fe81d208d91fc8fb00%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637575770340862619%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=4KIyoPOrvCCC9ljQQhJlxKPhjONiFK%2FEAHNXEc30BtQ%3D&reserved=0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx