[AMD Official Use Only - General] I can remove the support for SOC15_IH_CLIENTID_VMC from v10, but the reset type should be changed from bool to uint32 for all versions. Regards, Tao > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Sunday, March 17, 2024 6:10 PM > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: RE: [PATCH 3/3] drm/amdgpu: make reset method configurable for RAS > poison > > [AMD Official Use Only - General] > > Let's not copy kfd interrupt handler and the work queue implementation from v9 > to v10 since the firmware/hardware design are totally different. > > We shall have another patch to fix kfd int v10 for poison consumption handling > and also v11. > > Regards, > Hawking > > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao Zhou > Sent: Wednesday, March 13, 2024 17:12 > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 3/3] drm/amdgpu: make reset method configurable for RAS > poison > > Each RAS block has different requirement for gpu reset in poison consumption > handling. > Add support for mmhub RAS poison consumption handling. > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 ++-- > .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 20 ++++++++++++++----- > .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 20 ++++++++++++++----- > 7 files changed, 42 insertions(+), 24 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index 9687650b0fe3..262d20167039 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device > *adev) } > > void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device > *adev, > - enum amdgpu_ras_block block, bool reset) > + enum amdgpu_ras_block block, uint32_t reset) > { > amdgpu_umc_poison_handler(adev, block, reset); } diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 03bf20e0e3da..ad50c7bbc326 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct > amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct > amdgpu_device *adev, > struct tile_config *config); void > amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device > *adev, > - enum amdgpu_ras_block block, bool reset); > + enum amdgpu_ras_block block, uint32_t reset); > bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool > amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct > kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index e32a186c2de1..58fe7bebdf1b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2045,7 +2045,7 @@ static void > amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * > } > } > > - amdgpu_umc_poison_handler(adev, obj->head.block, false); > + amdgpu_umc_poison_handler(adev, obj->head.block, 0); > > if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) > poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); > @@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void > *param) > atomic_dec(&con->page_retirement_req_cnt); > > amdgpu_umc_bad_page_polling_timeout(adev, > - false, MAX_UMC_POISON_POLLING_TIME_ASYNC); > + 0, MAX_UMC_POISON_POLLING_TIME_ASYNC); > } > > return 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > index 20436f81856a..2c02585dcbff 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > @@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct > amdgpu_device *adev, > amdgpu_umc_handle_bad_pages(adev, ras_error_status); > > if (err_data->ue_count && reset) { > - /* use mode-2 reset for poison consumption */ > - if (!entry) > - con->gpu_reset_flags |= > AMDGPU_RAS_GPU_RESET_MODE2_RESET; > + con->gpu_reset_flags |= reset; > amdgpu_ras_reset_gpu(adev); > } > > @@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct > amdgpu_device *adev, } > > int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, > - bool reset, uint32_t timeout_ms) > + uint32_t reset, uint32_t timeout_ms) > { > struct ras_err_data err_data; > struct ras_common_if head = { > @@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct > amdgpu_device *adev, > if (reset) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > - /* use mode-2 reset for poison consumption */ > - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; > + con->gpu_reset_flags |= reset; > amdgpu_ras_reset_gpu(adev); > } > > @@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct > amdgpu_device *adev, } > > int amdgpu_umc_poison_handler(struct amdgpu_device *adev, > - enum amdgpu_ras_block block, bool reset) > + enum amdgpu_ras_block block, uint32_t reset) > { > int ret = AMDGPU_RAS_SUCCESS; > > @@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct > amdgpu_device *adev, > void *ras_error_status, > struct amdgpu_iv_entry *entry) { > - return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, > true); > + return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, > + AMDGPU_RAS_GPU_RESET_MODE1_RESET); > } > > int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > index 26d2ae498daf..4365a20eeb49 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > @@ -103,7 +103,7 @@ struct amdgpu_umc { > int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int > amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if > *ras_block); int amdgpu_umc_poison_handler(struct amdgpu_device *adev, > - enum amdgpu_ras_block block, bool reset); > + enum amdgpu_ras_block block, uint32_t reset); > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > struct amdgpu_irq_src *source, > struct amdgpu_iv_entry *entry); @@ -123,5 +123,5 @@ int > amdgpu_umc_loop_channels(struct amdgpu_device *adev, > umc_func func, void *data); > > int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, > - bool reset, uint32_t timeout_ms); > + uint32_t reset, uint32_t timeout_ms); > #endif > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c > index 650da18b0d87..94ab1f33fc4a 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c > @@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct > kfd_node *dev, { > enum amdgpu_ras_block block = 0; > int old_poison, ret = -EINVAL; > + uint32_t reset = 0; > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); > > if (!p) > @@ -153,6 +154,15 @@ static void event_interrupt_poison_consumption(struct > kfd_node *dev, > case SOC15_IH_CLIENTID_UTCL2: > ret = kfd_dqm_evict_pasid(dev->dqm, pasid); > block = AMDGPU_RAS_BLOCK__GFX; > + if (ret) > + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; > + break; > + case SOC15_IH_CLIENTID_VMC: > + case SOC15_IH_CLIENTID_VMC1: > + ret = kfd_dqm_evict_pasid(dev->dqm, pasid); > + block = AMDGPU_RAS_BLOCK__MMHUB; > + if (ret) > + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; > break; > case SOC15_IH_CLIENTID_SDMA0: > case SOC15_IH_CLIENTID_SDMA1: > @@ -160,6 +170,7 @@ static void event_interrupt_poison_consumption(struct > kfd_node *dev, > case SOC15_IH_CLIENTID_SDMA3: > case SOC15_IH_CLIENTID_SDMA4: > block = AMDGPU_RAS_BLOCK__SDMA; > + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; > break; > default: > break; > @@ -170,17 +181,16 @@ static void event_interrupt_poison_consumption(struct > kfd_node *dev, > /* resetting queue passes, do page retirement without gpu reset > * resetting queue fails, fallback to gpu reset solution > */ > - if (!ret) { > + if (!ret) > dev_warn(dev->adev->dev, > "RAS poison consumption, unmap queue flow succeeded: client > id %d\n", > client_id); > - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, > false); > - } else { > + else > dev_warn(dev->adev->dev, > "RAS poison consumption, fall back to gpu reset flow: client > id %d\n", > client_id); > - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, > true); > - } > + > + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, > + reset); > } > > static bool event_interrupt_isr_v10(struct kfd_node *dev, diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > index 11641f4645e6..2a37ab7a7150 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > @@ -145,6 +145,7 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, { > enum amdgpu_ras_block block = 0; > int old_poison, ret = -EINVAL; > + uint32_t reset = 0; > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); > > if (!p) > @@ -164,6 +165,15 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > case SOC15_IH_CLIENTID_UTCL2: > ret = kfd_dqm_evict_pasid(dev->dqm, pasid); > block = AMDGPU_RAS_BLOCK__GFX; > + if (ret) > + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; > + break; > + case SOC15_IH_CLIENTID_VMC: > + case SOC15_IH_CLIENTID_VMC1: > + ret = kfd_dqm_evict_pasid(dev->dqm, pasid); > + block = AMDGPU_RAS_BLOCK__MMHUB; > + if (ret) > + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; > break; > case SOC15_IH_CLIENTID_SDMA0: > case SOC15_IH_CLIENTID_SDMA1: > @@ -171,6 +181,7 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > case SOC15_IH_CLIENTID_SDMA3: > case SOC15_IH_CLIENTID_SDMA4: > block = AMDGPU_RAS_BLOCK__SDMA; > + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; > break; > default: > break; > @@ -181,17 +192,16 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > /* resetting queue passes, do page retirement without gpu reset > * resetting queue fails, fallback to gpu reset solution > */ > - if (!ret) { > + if (!ret) > dev_warn(dev->adev->dev, > "RAS poison consumption, unmap queue flow succeeded: client > id %d\n", > client_id); > - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, > false); > - } else { > + else > dev_warn(dev->adev->dev, > "RAS poison consumption, fall back to gpu reset flow: client > id %d\n", > client_id); > - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, > true); > - } > + > + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, > + reset); > } > > static bool context_id_expected(struct kfd_dev *dev) > -- > 2.34.1 >