<Ping>

On 11/18/2024 11:14 AM, Lijo Lazar wrote:
> Before scheduling a recovery due to a scheduler/job hang, check whether a
> RAS error has been detected. If so, choose RAS recovery to handle the
> situation; a scheduler/job hang could be the side effect of a RAS error.
> In such cases, it is required to go through the RAS error recovery
> process. RAS error recovery in certain cases can also avoid a full
> device reset.
>
> An error state is maintained in the RAS context to identify the affected
> block. The fatal error state uses an otherwise unused block id. Set the
> block id when an error is detected. If the interrupt handler detected a
> poison error, it is not required to look for a fatal error; skip fatal
> error checking in such cases.
>
> Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/aldebaran.c        |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 15 ++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 55 ++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       | 11 +++-
>  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +
>  5 files changed, 78 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 3a588fecb0c5..6a2fd9e4f470 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -332,6 +332,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>  	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
>  		dev_info(tmp_adev->dev,
>  			 "GPU reset succeeded, trying to resume\n");
> +		/*TBD: Ideally should clear only GFX, SDMA blocks*/
> +		amdgpu_ras_clear_err_state(tmp_adev);
>  		r = aldebaran_mode2_restore_ip(tmp_adev);
>  		if (r)
>  			goto end;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b3ca911e55d6..b4464f42d72a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5165,7 +5165,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>  	if (r)
>  		return r;
>
> -	amdgpu_ras_set_fed(adev, false);
> +	amdgpu_ras_clear_err_state(adev);
>  	amdgpu_irq_gpu_reset_resume_helper(adev);
>
>  	/* some sw clean up VF needs to do before recover */
> @@ -5460,7 +5460,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>  		amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
>  		if (full_reset) {
>  			/* post card */
> -			amdgpu_ras_set_fed(tmp_adev, false);
> +			amdgpu_ras_clear_err_state(tmp_adev);
>  			r = amdgpu_device_asic_init(tmp_adev);
>  			if (r) {
>  				dev_warn(tmp_adev->dev, "asic atom init failed!");
> @@ -5786,6 +5786,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  	bool audio_suspended = false;
>  	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
>
> +	/*
> +	 * If it reaches here because of hang/timeout and a RAS error is
> +	 * detected at the same time, let RAS recovery take care of it.
> +	 */
> +	if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
> +	    reset_context->src != AMDGPU_RESET_SRC_RAS) {
> +		dev_dbg(adev->dev,
> +			"Gpu recovery from source: %d yielding to RAS error recovery handling",
> +			reset_context->src);
> +		return 0;
> +	}
>  	/*
>  	 * Special case: RAS triggered and full reset isn't supported
>  	 */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1bc95b0cdbb8..e31b12144577 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2156,6 +2156,16 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
>  	/* Fatal error events are handled on host side */
>  	if (amdgpu_sriov_vf(adev))
>  		return;
> +	/**
> +	 * If the current interrupt is caused by a non-fatal RAS error, skip
> +	 * check for fatal error. For fatal errors, FED status of all devices
> +	 * in XGMI hive gets set when the first device gets fatal error
> +	 * interrupt. The error gets propagated to other devices as well, so
> +	 * make sure to ack the interrupt regardless of FED status.
> +	 */
> +	if (!amdgpu_ras_get_fed_status(adev) &&
> +	    amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
> +		return;
>
>  	if (adev->nbio.ras &&
>  	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
> @@ -2185,6 +2195,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>  	if (ret)
>  		return;
>
> +	amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
>  	/* both query_poison_status and handle_poison_consumption are optional,
>  	 * but at least one of them should be implemented if we need poison
>  	 * consumption handler
> @@ -4083,16 +4094,56 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
>  	if (!ras)
>  		return false;
>
> -	return atomic_read(&ras->fed);
> +	return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
>  }
>
>  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
>  {
>  	struct amdgpu_ras *ras;
>
> +	ras = amdgpu_ras_get_context(adev);
> +	if (ras) {
> +		if (status)
> +			set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> +		else
> +			clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> +	}
> +}
> +
> +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ras *ras;
> +
>  	ras = amdgpu_ras_get_context(adev);
>  	if (ras)
> -		atomic_set(&ras->fed, !!status);
> +		ras->ras_err_state = 0;
> +}
> +
> +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
> +			       enum amdgpu_ras_block block)
> +{
> +	struct amdgpu_ras *ras;
> +
> +	ras = amdgpu_ras_get_context(adev);
> +	if (ras)
> +		set_bit(block, &ras->ras_err_state);
> +}
> +
> +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block)
> +{
> +	struct amdgpu_ras *ras;
> +
> +	ras = amdgpu_ras_get_context(adev);
> +	if (ras) {
> +		if (block == AMDGPU_RAS_BLOCK__ANY)
> +			return (ras->ras_err_state != 0);
> +		else
> +			return test_bit(block, &ras->ras_err_state) ||
> +			       test_bit(AMDGPU_RAS_BLOCK__LAST,
> +					&ras->ras_err_state);
> +	}
> +
> +	return false;
>  }
>
>  static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6db772ecfee4..b13debcf48ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -99,7 +99,8 @@ enum amdgpu_ras_block {
>  	AMDGPU_RAS_BLOCK__IH,
>  	AMDGPU_RAS_BLOCK__MPIO,
>
> -	AMDGPU_RAS_BLOCK__LAST
> +	AMDGPU_RAS_BLOCK__LAST,
> +	AMDGPU_RAS_BLOCK__ANY = -1
>  };
>
>  enum amdgpu_ras_mca_block {
> @@ -558,8 +559,8 @@ struct amdgpu_ras {
>  	struct ras_ecc_log_info umc_ecc_log;
>  	struct delayed_work page_retirement_dwork;
>
> -	/* Fatal error detected flag */
> -	atomic_t fed;
> +	/* ras errors detected */
> +	unsigned long ras_err_state;
>
>  	/* RAS event manager */
>  	struct ras_event_manager __event_mgr;
> @@ -952,6 +953,10 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
>
>  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
>  bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
> +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
> +			       enum amdgpu_ras_block block);
> +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev);
> +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block);
>
>  u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
>  int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index d46a13156ee9..0cb5c582ce7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -184,6 +184,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>  		} else {
>  			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>  		}
> +		amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX);
>  		break;
>  	case SOC15_IH_CLIENTID_VMC:
>  	case SOC15_IH_CLIENTID_VMC1:
> @@ -213,6 +214,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>  		} else {
>  			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>  		}
> +		amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA);
>  		break;
>  	default:
>  		dev_warn(dev->adev->dev,
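
For reviewers who want to see the state-tracking scheme in isolation, here is a
minimal user-space sketch (not driver code) of how the single ras_err_state
bitmask above is meant to behave: poison consumption sets the bit of the
affected block, the fatal (FED) state reuses the otherwise unused
AMDGPU_RAS_BLOCK__LAST bit, and the job-timeout path only asks whether any
error is pending via AMDGPU_RAS_BLOCK__ANY. The enum values and helper names
mirror the patch; the standalone scaffolding (main, printf, plain shifts
instead of set_bit()/test_bit()) is illustrative only.

/*
 * Minimal user-space model of the ras_err_state bitmask from the patch above.
 * Names mirror the patch; everything else is hypothetical scaffolding.
 */
#include <stdbool.h>
#include <stdio.h>

enum ras_block {
	RAS_BLOCK__GFX,
	RAS_BLOCK__SDMA,
	RAS_BLOCK__LAST,	/* no real block uses this id, so it doubles as the fatal (FED) bit */
	RAS_BLOCK__ANY = -1,
};

static unsigned long ras_err_state;

/* Poison consumption marks only the block that consumed the poison. */
static void set_err_poison(enum ras_block block)
{
	ras_err_state |= 1UL << block;
}

/* Fatal errors set or clear the spare RAS_BLOCK__LAST bit. */
static void set_fed(bool status)
{
	if (status)
		ras_err_state |= 1UL << RAS_BLOCK__LAST;
	else
		ras_err_state &= ~(1UL << RAS_BLOCK__LAST);
}

/* ANY means "any error pending"; a specific block also reports true on fatal. */
static bool is_err_state(int block)
{
	if (block == RAS_BLOCK__ANY)
		return ras_err_state != 0;
	return (ras_err_state & (1UL << block)) ||
	       (ras_err_state & (1UL << RAS_BLOCK__LAST));
}

int main(void)
{
	set_err_poison(RAS_BLOCK__SDMA);
	/* Job-timeout path: a RAS error is pending, so yield to RAS recovery. */
	printf("any error pending:  %d\n", is_err_state(RAS_BLOCK__ANY));
	/* GFX consumed no poison, so it stays clean until a fatal error arrives. */
	printf("gfx err before FED: %d\n", is_err_state(RAS_BLOCK__GFX));
	set_fed(true);
	printf("gfx err after FED:  %d\n", is_err_state(RAS_BLOCK__GFX));
	return 0;
}

Running this prints 1 / 0 / 1: a pending SDMA poison alone is enough to divert
a job-timeout reset into RAS recovery, and a fatal error flags every block.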