On 2021-05-13 4:00 a.m., Christian König wrote: > Am 12.05.21 um 19:03 schrieb Luben Tuikov: >> When using Vega 20 with RAS support and RAS is >> enabled, the system interactivity is extremely >> slow, to the point of being unusable. After >> debugging, it was determined that this is due to >> the polling loop performed for >> AMDGPU_CTX_OP_QUERY_STATE2 under >> amdgpu_ctx_ioctl(), which seems to be executed on >> every ioctl from X/Mesa. >> >> The latter seems to call amdgpu_ctx_query2() which >> calls amdgpu_ras_query_error_count() twice, once >> for each of ue_count and ce_count. This is >> unnecessarily wasteful since >> amdgpu_ras_query_error_count() calculates both, >> but with the current interface it returns one or >> the other, depending on its Boolean input, when it >> can in fact return both, in a single invocation, >> if it had a better interface. >> >> Further down, the query_ras_error_count() callback >> is called, which could be quite a large polling >> loop, and very time consuming. For instance, >> gfx_v9_0_query_ras_error_count() is at least >> O(n^3). A similar situation is seen with >> umc_v6_1_query_ras_error_count(). >> >> This commit implements asynchronous RAS error >> count polling to that of the ioctl. A kernel >> thread polls the RAS error counters once in a >> while. The ioctl reads the values >> asynchronously. The poll frequency is a module >> parameter, with range [500, 10000] milliseconds, >> default 3000. >> >> v2: Enable setting the polling interval to 0, >> which disables the thread. > Please drop the module parameter, we already have way to many module > parameters and that doesn't add much value. No problem. So three seconds then? > > Then I would really prefer to implement this as a delayed work item instead. > > If you call schedule_delayed_work() from the amdgpu_ctx_query2() the > work item will only be started every X jiffies when the function is > actually used. Yes, you mentioned this in our meeting and I did reply to this then: I'd rather keep it a thread, because it shows intention, it shows that we've committed that this is work which we need done, periodically and that it is something we want to do--using a thread makes it visible, known, intentional. I don't mind using "delayed work", for "work" items which are one-off, or something intermittent, non-regular type of work, non-periodic type of work. Regards, Luben > > Regards, > Christian. > >> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> >> Cc: John Clements <john.clements@xxxxxxx> >> Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> >> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> >> Reviewed-by: John Clements <john.clements@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 36 +++++++------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 66 ++++++++++++++++++++++--- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 8 +-- >> 5 files changed, 93 insertions(+), 28 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 3147c1c935c8..a269f778194f 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -197,6 +197,7 @@ extern struct amdgpu_mgpu_info mgpu_info; >> extern int amdgpu_ras_enable; >> extern uint amdgpu_ras_mask; >> extern int amdgpu_bad_page_threshold; >> +extern uint amdgpu_ras_thread_poll_ms; >> extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer; >> extern int amdgpu_async_gfx_ring; >> extern int amdgpu_mcbp; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> index d481a33f4eaf..558e887e99b6 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c >> @@ -332,12 +332,12 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, >> } >> >> static int amdgpu_ctx_query2(struct amdgpu_device *adev, >> - struct amdgpu_fpriv *fpriv, uint32_t id, >> - union drm_amdgpu_ctx_out *out) >> + struct amdgpu_fpriv *fpriv, uint32_t id, >> + union drm_amdgpu_ctx_out *out) >> { >> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> struct amdgpu_ctx *ctx; >> struct amdgpu_ctx_mgr *mgr; >> - unsigned long ras_counter; >> >> if (!fpriv) >> return -EINVAL; >> @@ -362,20 +362,22 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, >> if (atomic_read(&ctx->guilty)) >> out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; >> >> - /*query ue count*/ >> - /* ras_counter = amdgpu_ras_query_error_count(adev, false); */ >> - /* /\*ras counter is monotonic increasing*\/ */ >> - /* if (ras_counter != ctx->ras_counter_ue) { */ >> - /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */ >> - /* ctx->ras_counter_ue = ras_counter; */ >> - /* } */ >> - >> - /* /\*query ce count*\/ */ >> - /* ras_counter = amdgpu_ras_query_error_count(adev, true); */ >> - /* if (ras_counter != ctx->ras_counter_ce) { */ >> - /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */ >> - /* ctx->ras_counter_ce = ras_counter; */ >> - /* } */ >> + if (con) { >> + int ce_count, ue_count; >> + >> + ce_count = atomic_read(&con->ras_ce_count); >> + ue_count = atomic_read(&con->ras_ue_count); >> + >> + if (ce_count != ctx->ras_counter_ce) { >> + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; >> + ctx->ras_counter_ce = ce_count; >> + } >> + >> + if (ue_count != ctx->ras_counter_ue) { >> + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; >> + ctx->ras_counter_ue = ue_count; >> + } >> + } >> >> mutex_unlock(&mgr->lock); >> return 0; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index 877469d288f8..641c374b8525 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -183,6 +183,7 @@ struct amdgpu_mgpu_info mgpu_info = { >> int amdgpu_ras_enable = -1; >> uint amdgpu_ras_mask = 0xffffffff; >> int amdgpu_bad_page_threshold = -1; >> +uint amdgpu_ras_thread_poll_ms = 3000; >> struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { >> .timeout_fatal_disable = false, >> .period = 0x0, /* default to 0x0 (timeout disable) */ >> @@ -534,6 +535,15 @@ module_param_named(emu_mode, amdgpu_emu_mode, int, 0444); >> MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))"); >> module_param_named(ras_enable, amdgpu_ras_enable, int, 0444); >> >> +/** >> + * DOC: ras_thread_poll (uint) >> + * Number of milliseconds between RAS poll for errors. >> + * Valid range 0 and [500,10000]. Set to 0 to disable the thread. >> + * Default: 3000. >> + */ >> +MODULE_PARM_DESC(ras_thread_poll, "Number of milliseconds between RAS poll for errors. Valid range 0 and [500,10000]. Set to 0 to disable the thread. Default: 3000."); >> +module_param_named(ras_thread_poll, amdgpu_ras_thread_poll_ms, uint, 0444); >> + >> /** >> * DOC: ras_mask (uint) >> * Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1 >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> index b1c57a5b6e89..30bec289e9ad 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> @@ -1043,15 +1043,17 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, >> } >> >> /* get the total error counts on all IPs */ >> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> - bool is_ce) >> +static void amdgpu_ras_query_error_count(struct amdgpu_device *adev) >> { >> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> struct ras_manager *obj; >> - struct ras_err_data data = {0, 0}; >> + int ce_count, ue_count; >> >> if (!adev->ras_enabled || !con) >> - return 0; >> + return; >> + >> + ce_count = 0; >> + ue_count = 0; >> >> list_for_each_entry(obj, &con->head, node) { >> struct ras_query_if info = { >> @@ -1059,13 +1061,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> }; >> >> if (amdgpu_ras_query_error_status(adev, &info)) >> - return 0; >> + return; >> >> - data.ce_count += info.ce_count; >> - data.ue_count += info.ue_count; >> + ce_count += info.ce_count; >> + ue_count += info.ue_count; >> } >> >> - return is_ce ? data.ce_count : data.ue_count; >> + atomic_set(&con->ras_ce_count, ce_count); >> + atomic_set(&con->ras_ue_count, ue_count); >> } >> /* query/inject/cure end */ >> >> @@ -2109,6 +2112,49 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) >> adev->ras_hw_enabled & amdgpu_ras_mask; >> } >> >> +static int amdgpu_ras_thread(void *data) >> +{ >> + struct amdgpu_device *adev = data; >> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> + >> + con->ras_thread_poll_ms = amdgpu_ras_thread_poll_ms; >> + if (con->ras_thread_poll_ms == 0) { >> + atomic_set(&con->ras_ce_count, 0); >> + atomic_set(&con->ras_ue_count, 0); >> + return 0; >> + } else if (con->ras_thread_poll_ms < 500 || >> + con->ras_thread_poll_ms > 10000) { >> + con->ras_thread_poll_ms = 3000; >> + } >> + >> + while (1) { >> + if (kthread_should_stop()) >> + break; >> + if (kthread_should_park()) >> + kthread_parkme(); >> + >> + amdgpu_ras_query_error_count(adev); >> + msleep_interruptible(con->ras_thread_poll_ms); >> + } >> + >> + return 0; >> +} >> + >> +static int amdgpu_ras_thread_start(struct amdgpu_device *adev) >> +{ >> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> + struct task_struct *kt; >> + >> + kt = kthread_run(amdgpu_ras_thread, adev, >> + "amdras:%s", pci_name(adev->pdev)); >> + if (IS_ERR(kt)) >> + return PTR_ERR(kt); >> + >> + con->ras_thread = kt; >> + >> + return 0; >> +} >> + >> int amdgpu_ras_init(struct amdgpu_device *adev) >> { >> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> @@ -2182,6 +2228,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev) >> goto release_con; >> } >> >> + r = amdgpu_ras_thread_start(adev); >> + if (r) >> + goto release_con; >> + >> dev_info(adev->dev, "RAS INFO: ras initialized successfully, " >> "hardware ability[%x] ras_mask[%x]\n", >> adev->ras_hw_enabled, adev->ras_enabled); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> index 201fbdee1d09..fb9e4c7ab054 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> @@ -340,6 +340,11 @@ struct amdgpu_ras { >> >> /* disable ras error count harvest in recovery */ >> bool disable_ras_err_cnt_harvest; >> + >> + struct task_struct *ras_thread; >> + unsigned int ras_thread_poll_ms; >> + atomic_t ras_ue_count; >> + atomic_t ras_ce_count; >> }; >> >> struct ras_fs_data { >> @@ -485,9 +490,6 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, >> void amdgpu_ras_resume(struct amdgpu_device *adev); >> void amdgpu_ras_suspend(struct amdgpu_device *adev); >> >> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> - bool is_ce); >> - >> /* error handling functions */ >> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, >> struct eeprom_table_record *bps, int pages); _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx