On Fri, May 21, 2021 at 5:19 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote: > > On Context Query2 IOCTL return the correctable and > uncorrectable errors in O(1) fashion, from cached > values, and schedule a delayed work function to > calculate and cache them for the next such IOCTL. Patches 1, 2, are: Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> For patch 3, I think we need to cancel any outstanding delayed work in ras_fini(). Other than that, it looks good to me. Alex > > Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> > Cc: Christian König <christian.koenig@xxxxxxx> > Cc: John Clements <john.clements@xxxxxxx> > Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 32 +++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ > 3 files changed, 73 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > index bb0cfe871aba..4e95d255960b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c > @@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, > return 0; > } > > +#define AMDGPU_RAS_COUNTE_DELAY_MS 3000 > + > static int amdgpu_ctx_query2(struct amdgpu_device *adev, > - struct amdgpu_fpriv *fpriv, uint32_t id, > - union drm_amdgpu_ctx_out *out) > + struct amdgpu_fpriv *fpriv, uint32_t id, > + union drm_amdgpu_ctx_out *out) > { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct amdgpu_ctx *ctx; > struct amdgpu_ctx_mgr *mgr; > > @@ -361,6 +364,31 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, > if (atomic_read(&ctx->guilty)) > out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; > > + if (adev->ras_enabled && con) { > + /* Return the cached values in O(1), > + * and schedule delayed work to cache > + * new vaues. > + */ > + int ce_count, ue_count; > + > + ce_count = atomic_read(&con->ras_ce_count); > + ue_count = atomic_read(&con->ras_ue_count); > + > + if (ce_count != ctx->ras_counter_ce) { > + ctx->ras_counter_ce = ce_count; > + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; > + } > + > + if (ue_count != ctx->ras_counter_ue) { > + ctx->ras_counter_ue = ue_count; > + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; > + } > + > + if (!delayed_work_pending(&con->ras_counte_delay_work)) > + schedule_delayed_work(&con->ras_counte_delay_work, > + msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS)); > + } > + > mutex_unlock(&mgr->lock); > return 0; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ed3c43e8b0b5..80f576098318 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -27,6 +27,7 @@ > #include <linux/uaccess.h> > #include <linux/reboot.h> > #include <linux/syscalls.h> > +#include <linux/pm_runtime.h> > > #include "amdgpu.h" > #include "amdgpu_ras.h" > @@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) > adev->ras_hw_enabled & amdgpu_ras_mask; > } > > +static void amdgpu_ras_counte_dw(struct work_struct *work) > +{ > + struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, > + ras_counte_delay_work.work); > + struct amdgpu_device *adev = con->adev; > + struct drm_device *dev = &adev->ddev; > + unsigned long ce_count, ue_count; > + int res; > + > + res = pm_runtime_get_sync(dev->dev); > + if (res < 0) > + goto Out; > + > + /* Cache new values. > + */ > + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); > + atomic_set(&con->ras_ce_count, ce_count); > + atomic_set(&con->ras_ue_count, ue_count); > + > + pm_runtime_mark_last_busy(dev->dev); > +Out: > + pm_runtime_put_autosuspend(dev->dev); > +} > + > int amdgpu_ras_init(struct amdgpu_device *adev) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > @@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > if (!con) > return -ENOMEM; > > + con->adev = adev; > + INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); > + atomic_set(&con->ras_ce_count, 0); > + atomic_set(&con->ras_ue_count, 0); > + > con->objs = (struct ras_manager *)(con + 1); > > amdgpu_ras_set_context(adev, con); > @@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, > struct ras_fs_if *fs_info, > struct ras_ih_if *ih_info) > { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + unsigned long ue_count, ce_count; > int r; > > /* disable RAS feature per IP block if it is not supported */ > @@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, > if (r) > goto sysfs; > > + /* Those are the cached values at init. > + */ > + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); > + atomic_set(&con->ras_ce_count, ce_count); > + atomic_set(&con->ras_ue_count, ue_count); > + > return 0; > cleanup: > amdgpu_ras_sysfs_remove(adev, ras_block); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 10fca0393106..256cea5d34f2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -340,6 +340,11 @@ struct amdgpu_ras { > > /* disable ras error count harvest in recovery */ > bool disable_ras_err_cnt_harvest; > + > + /* RAS count errors delayed work */ > + struct delayed_work ras_counte_delay_work; > + atomic_t ras_ue_count; > + atomic_t ras_ce_count; > }; > > struct ras_fs_data { > -- > 2.31.1.527.g2d677e5b15 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx