[AMD Official Use Only - AMD Internal Distribution Only] Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: Chai, Thomas <YiPeng.Chai@xxxxxxx> Sent: Thursday, June 20, 2024 13:40 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> Subject: [PATCH V2 4/4] drm/amdgpu: add gpu reset check and exception handling Add gpu reset check and exception handling for page retirement. v2: Clear poison consumption messages cached in fifo after non mode-1 reset. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a4030dc12a1c..ce7c7723e626 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1384,10 +1384,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i memset(&qctx, 0, sizeof(qctx)); qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + ret = amdgpu_ras_query_error_status_helper(adev, info, &err_data, &qctx, error_query_mode); + up_read(&adev->reset_domain->sem); if (ret) goto out_fini_err_data; @@ -2916,6 +2921,14 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, return 0; } +static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_poison_msg msg; + + while (kfifo_get(&con->poison_fifo, &msg)); } + static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, uint32_t msg_count, uint32_t *gpu_reset) { @@ -2946,12 +2959,20 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, else reset = reset_flags; + /* If gpu reset is ongoing, not need to reset gpu again */ + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + up_read(&adev->reset_domain->sem); + flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); *gpu_reset = reset; + + /* Wait for gpu recovery to complete */ + flush_work(&con->recovery_work); } return 0; @@ -3000,6 +3021,37 @@ static int amdgpu_ras_page_retirement_thread(void *param) } } + if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { + /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ + /* Clear poison creation request */ + atomic_set(&con->poison_creation_count, 0); + + /* Clear poison fifo */ + amdgpu_ras_clear_poison_fifo(adev); + + /* Clear all poison requests*/ + atomic_set(&con->page_retirement_req_cnt, 0); + + if (ret == -EIO) { + /* Wait for mode-1 reset to complete */ + down_read(&adev->reset_domain->sem); + up_read(&adev->reset_domain->sem); + } + + /* Wake up work to save bad pages to eeprom */ + schedule_delayed_work(&con->page_retirement_dwork, 0); + } else if (gpu_reset) { + /* gpu just completed mode-2 reset or other reset */ + /* Clear poison consumption messages cached in fifo */ + msg_count = kfifo_len(&con->poison_fifo); + if (msg_count) { + amdgpu_ras_clear_poison_fifo(adev); + atomic_sub(msg_count, &con->page_retirement_req_cnt); + } + + /* Wake up work to save bad pages to eeprom */ + schedule_delayed_work(&con->page_retirement_dwork, 0); + } #else dev_info(adev->dev, "Start processing page retirement. request:%d\n", atomic_read(&con->page_retirement_req_cnt)); -- 2.34.1