[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Sent: Monday, August 12, 2024 11:26 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: Add debug option to enable mode2 for poison > recovery > > Add debug option to enable mode2 for poison recovery for testing purposes only. > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++ > drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 16 ++++++++++------ > 3 files changed, 17 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index e6b641cb362a..c34819f947ed 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1201,6 +1201,7 @@ struct amdgpu_device { > bool debug_disable_soft_recovery; > bool debug_use_vram_fw_buf; > bool debug_enable_ras_aca; > + bool debug_mode2_for_poison_recovery; > }; > > static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index afe3b8bd35a1..be6b920933d6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -133,6 +133,7 @@ enum AMDGPU_DEBUG_MASK { > AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2), > AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3), > AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4), > + AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY = BIT(5), > }; > > unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2229,6 +2230,11 @@ static > void amdgpu_init_debug_options(struct amdgpu_device *adev) > pr_info("debug: enable RAS ACA\n"); > adev->debug_enable_ras_aca = true; > } > + > + if (amdgpu_debug_mask & > AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY) { > 
+ pr_info("debug: enable mode2 reset for poison consumption > recovery"); > + adev->debug_mode2_for_poison_recovery = true; > + } > } > > static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long > flags) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > index 816800555f7f..a355b2bc2214 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > @@ -164,10 +164,12 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > case SOC15_IH_CLIENTID_SE3SH: > case SOC15_IH_CLIENTID_UTCL2: > block = AMDGPU_RAS_BLOCK__GFX; > - if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == > IP_VERSION(9, 4, 3)) > - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; > - else > + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == > IP_VERSION(9, 4, 3)) { > + reset = ((dev->adev- > >debug_mode2_for_poison_recovery) ? > + AMDGPU_RAS_GPU_RESET_MODE2_RESET : > AMDGPU_RAS_GPU_RESET_MODE1_RESET); [Tao] Can we apply the debug option to all ASICs? > + } else { > reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; > + } > break; > case SOC15_IH_CLIENTID_VMC: > case SOC15_IH_CLIENTID_VMC1: > @@ -180,10 +182,12 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > case SOC15_IH_CLIENTID_SDMA3: > case SOC15_IH_CLIENTID_SDMA4: > block = AMDGPU_RAS_BLOCK__SDMA; > - if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == > IP_VERSION(9, 4, 3)) > - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; > - else > + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == > IP_VERSION(9, 4, 3)) { > + reset = ((dev->adev- > >debug_mode2_for_poison_recovery) ? > + AMDGPU_RAS_GPU_RESET_MODE2_RESET : > AMDGPU_RAS_GPU_RESET_MODE1_RESET); > + } else { > reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; > + } > break; > default: > dev_warn(dev->adev->dev, > -- > 2.17.1