[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> Sent: Tuesday, April 16, 2024 2:16 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison
>
> mode-2 reset is the only reliable method that can get GC/SDMA back when
> poison is consumed. mmhub requires
> mode-1 reset.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> ---
> .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 27 ++++++-------------
> 1 file changed, 8 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index c368c70df3f4a..c3beb872adf8d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -144,7 +144,7 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> uint16_t pasid, uint16_t client_id) {
> enum amdgpu_ras_block block = 0;
> - int old_poison, ret = -EINVAL;
> + int old_poison;
> uint32_t reset = 0;
> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>
> @@ -163,17 +163,13 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> case SOC15_IH_CLIENTID_SE2SH:
> case SOC15_IH_CLIENTID_SE3SH:
> case SOC15_IH_CLIENTID_UTCL2:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> block = AMDGPU_RAS_BLOCK__GFX;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> break;
> case SOC15_IH_CLIENTID_VMC:
> case SOC15_IH_CLIENTID_VMC1:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> block = AMDGPU_RAS_BLOCK__MMHUB;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> break;
> case SOC15_IH_CLIENTID_SDMA0:
> case SOC15_IH_CLIENTID_SDMA1:
> @@ -184,22 +180,15 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> break;
> default:
> - break;
> + dev_warn(dev->adev->dev,
> + "client %d does not support poison consumption\n",
> client_id);
> + return;
> }
>
> kfd_signal_poison_consumed_event(dev, pasid);
>
> - /* resetting queue passes, do page retirement without gpu reset
> - * resetting queue fails, fallback to gpu reset solution
> - */
> - if (!ret)
> - dev_warn(dev->adev->dev,
> - "RAS poison consumption, unmap queue flow
> succeeded: client id %d\n",
> - client_id);
> - else
> - dev_warn(dev->adev->dev,
> - "RAS poison consumption, fall back to gpu reset flow:
> client id %d\n",
> - client_id);
> + dev_warn(dev->adev->dev,
> + "poison is consumed by client %d, kick off gpu reset flow\n",
> +client_id);
>
> amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> reset); }
> --
> 2.17.1