[AMD Official Use Only - AMD Internal Distribution Only] Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: Chai, Thomas <YiPeng.Chai@xxxxxxx> Sent: Thursday, June 20, 2024 13:40 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> Subject: [PATCH V2 3/4] drm/amdgpu: refine poison consumption interrupt handler 1. The poison fifo is only used for poison consumption requests. 2. Merge reset requests when the poison fifo caches multiple poison consumption messages. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 ++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 12 +++--- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 308348b4644f..a4030dc12a1c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2917,23 +2917,41 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, } static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, - struct ras_poison_msg *poison_msg) + uint32_t msg_count, uint32_t *gpu_reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t reset = poison_msg->reset; - uint16_t pasid = poison_msg->pasid; + uint32_t reset_flags = 0, reset = 0; + struct ras_poison_msg msg; + int ret, i; kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (poison_msg->pasid_fn) - poison_msg->pasid_fn(adev, pasid, poison_msg->data); + for (i = 0; i < msg_count; i++) { + ret = amdgpu_ras_get_poison_req(adev, &msg); + if (!ret) + continue; + + if (msg.pasid_fn) + msg.pasid_fn(adev, msg.pasid, msg.data); + + reset_flags |= msg.reset; + } /* for RMA, 
amdgpu_ras_poison_creation_handler will trigger gpu reset */ - if (reset && !con->is_rma) { + if (reset_flags && !con->is_rma) { + if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + else + reset = reset_flags; + flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); + + *gpu_reset = reset; } return 0; @@ -2943,11 +2961,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) { struct amdgpu_device *adev = (struct amdgpu_device *)param; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t poison_creation_count; + uint32_t poison_creation_count, msg_count; + uint32_t gpu_reset; int ret; - struct ras_poison_msg poison_msg; - enum amdgpu_ras_block ras_block; - bool poison_creation_is_handled = false; while (!kthread_should_stop()) { @@ -2958,6 +2974,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) if (kthread_should_stop()) break; + gpu_reset = 0; do { poison_creation_count = atomic_read(&con->poison_creation_count); @@ -2972,16 +2989,17 @@ static int amdgpu_ras_page_retirement_thread(void *param) } while (atomic_read(&con->poison_creation_count)); #ifdef HAVE_KFIFO_PUT_NON_POINTER - if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) - continue; - - ras_block = poison_msg.block; - - dev_dbg(adev->dev, "Start processing ras block %s(%d)\n", - ras_block_str(ras_block), ras_block); - + if (ret != -EIO) { + msg_count = kfifo_len(&con->poison_fifo); + if (msg_count) { + ret = amdgpu_ras_poison_consumption_handler(adev, + msg_count, &gpu_reset); + if ((ret != -EIO) && + (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) + atomic_sub(msg_count, &con->page_retirement_req_cnt); + } + } - amdgpu_ras_poison_consumption_handler(adev, &poison_msg); #else dev_info(adev->dev, "Start processing page retirement. 
request:%d\n", atomic_read(&con->page_retirement_req_cnt)); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 4a72ff8d8d80..38e7793137ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -296,13 +296,15 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); #ifdef HAVE_KFIFO_PUT_NON_POINTER - amdgpu_ras_put_poison_req(adev, + int ret; + + ret = amdgpu_ras_put_poison_req(adev, block, pasid, pasid_fn, data, reset); + if (!ret) { + atomic_inc(&con->page_retirement_req_cnt); + wake_up(&con->page_retirement_wq); + } #endif - - atomic_inc(&con->page_retirement_req_cnt); - - wake_up(&con->page_retirement_wq); } } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) -- 2.34.1