On 8/29/19 3:56 AM, Zhou1, Tao wrote: > >> -----Original Message----- >> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of >> Andrey Grodzovsky >> Sent: 2019年8月29日 4:00 >> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> Cc: alexdeucher@xxxxxxxxx; ckoenig.leichtzumerken@xxxxxxxxx; >> Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Zhang, Hawking >> <Hawking.Zhang@xxxxxxx> >> Subject: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS. >> >> Problem: >> Under certain conditions, when some IP bocks take a RAS error, we can get > [Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks" > >> into a situation where a GPU reset is not possible due to issues in RAS in >> SMU/PSP. >> >> Temporary fix until proper solution in PSP/SMU is ready: >> When uncorrectable error happens the DF will unconditionally broadcast >> error event packets to all its clients/slave upon receiving fatal error event and >> freeze all its outbound queues, err_event_athub interrupt will be triggered. >> In such case and we use this interrupt >> to issue GPU reset. THe GPU reset code is modified for such case to avoid HW >> reset, only stops schedulers, deatches all in progress and not yet scheduled >> job's fences, set error code on them and signals. >> Also reject any new incoming job submissions from user space. >> All this is done to notify the applications of the problem. 
>> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 >> ++++++++++++++++++++++-------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++++++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++- >> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +-- >> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++---- >> drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++ >> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++----- >> 10 files changed, 164 insertions(+), 62 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index 9da681e..300adb8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -38,6 +38,7 @@ >> #include "amdgpu_gmc.h" >> #include "amdgpu_gem.h" >> #include "amdgpu_display.h" >> +#include "amdgpu_ras.h" >> >> #if defined(HAVE_DRM_FREE_LARGE) >> #define kvfree drm_free_large >> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void >> *data, struct drm_file *filp) >> bool reserved_buffers = false; >> int i, r; >> >> + if (amdgpu_ras_intr_triggered()) >> + return -EHWPOISON; >> + >> if (!adev->accel_working) >> return -EBUSY; >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 07a4ba0..3ecee10 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct >> amdgpu_device *adev, bool trylock) >> return true; >> } >> >> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) >> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool >> +skip_kfd) >> { >> /*unlock kfd: SRIOV would do it separately */ >> - if 
(!amdgpu_sriov_vf(adev)) >> + if (!amdgpu_sriov_vf(adev) && !skip_kfd) >> amdgpu_amdkfd_post_reset(adev); >> amdgpu_vf_error_trans_all(adev); >> adev->mp1_state = PP_MP1_STATE_NONE; >> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct >> amdgpu_device *adev) } >> >> >> +#define to_drm_sched_job(sched_job) \ >> + container_of((sched_job), struct drm_sched_job, >> queue_node) >> + >> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler >> +*sched) { >> + struct drm_sched_job *s_job; >> + struct drm_sched_entity *s_entity = NULL; >> + int i; >> + >> + /* Signal all jobs not yet scheduled */ >> + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= >> DRM_SCHED_PRIORITY_MIN; i--) { >> + struct drm_sched_rq *rq = &sched->sched_rq[i]; >> + >> + if (!rq) >> + continue; >> + >> + spin_lock(&rq->lock); >> + list_for_each_entry(s_entity, &rq->entities, list) { >> + while ((s_job = >> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { >> + struct drm_sched_fence *s_fence = s_job- >>> s_fence; >> + >> + dma_fence_signal(&s_fence->scheduled); >> + dma_fence_set_error(&s_fence->finished, - >> EHWPOISON); >> + dma_fence_signal(&s_fence->finished); >> + } >> + } >> + spin_unlock(&rq->lock); >> + } >> + >> + /* Signal all jobs already scheduled to HW */ >> + list_for_each_entry(s_job, &sched->ring_mirror_list, node) { >> + struct drm_sched_fence *s_fence = s_job->s_fence; >> + >> + dma_fence_set_error(&s_fence->finished, -EHWPOISON); >> + dma_fence_signal(&s_fence->finished); >> + } >> +} >> + >> /** >> * amdgpu_device_gpu_recover - reset the asic and recover scheduler >> * >> @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> struct amdgpu_hive_info *hive = NULL; >> struct amdgpu_device *tmp_adev = NULL; >> int i, r = 0; >> + bool in_ras_intr = amdgpu_ras_intr_triggered(); >> >> need_full_reset = job_signaled = false; >> INIT_LIST_HEAD(&device_list); >> >> - dev_info(adev->dev, "GPU reset begin!\n"); >> + 
dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs >> +stop":"reset"); >> >> cancel_delayed_work_sync(&adev->delayed_init_work); >> >> @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> /* Build list of devices to reset */ >> if (adev->gmc.xgmi.num_physical_nodes > 1) { >> if (!hive) { >> - amdgpu_device_unlock_adev(adev); >> + amdgpu_device_unlock_adev(adev, false); >> return -ENODEV; >> } >> >> @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> /* block all schedulers and reset given job's ring */ >> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { >> /* disable ras on ALL IPs */ >> - if (amdgpu_device_ip_need_full_reset(tmp_adev)) >> + if (!in_ras_intr && >> amdgpu_device_ip_need_full_reset(tmp_adev)) >> amdgpu_ras_suspend(tmp_adev); >> >> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3834,10 >> +3873,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, >> continue; >> >> drm_sched_stop(&ring->sched, job ? &job->base : >> NULL); >> + >> + if (in_ras_intr) >> + amdgpu_stop_all_jobs_on_sched(&ring- >>> sched); >> } >> } >> >> >> + if (in_ras_intr) >> + goto skip_hw_reset; >> + >> /* >> * Must check guilty signal here since after this point all old >> * HW fences are force signaled. 
>> @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> >> /* Post ASIC reset for all devs .*/ >> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { >> - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> - struct amdgpu_ring *ring = tmp_adev->rings[i]; >> >> - if (!ring || !ring->sched.thread) >> - continue; >> + if (!in_ras_intr) { >> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >> + struct amdgpu_ring *ring = tmp_adev- >>> rings[i]; >> - /* No point to resubmit jobs if we didn't HW reset*/ >> - if (!tmp_adev->asic_reset_res && !job_signaled) >> - drm_sched_resubmit_jobs(&ring->sched); >> + if (!ring || !ring->sched.thread) >> + continue; >> >> - drm_sched_start(&ring->sched, !tmp_adev- >>> asic_reset_res); >> - } >> + /* No point to resubmit jobs if we didn't HW >> reset*/ >> + if (!tmp_adev->asic_reset_res >> && !job_signaled) >> + drm_sched_resubmit_jobs(&ring- >>> sched); >> - if (!amdgpu_device_has_dc_support(tmp_adev) >> && !job_signaled) { >> - drm_helper_resume_force_mode(tmp_adev->ddev); >> - } >> + drm_sched_start(&ring->sched, !tmp_adev- >>> asic_reset_res); >> + } >> >> - tmp_adev->asic_reset_res = 0; >> + if (!amdgpu_device_has_dc_support(tmp_adev) >> && !job_signaled) { >> + drm_helper_resume_force_mode(tmp_adev- >>> ddev); >> + } >> >> - if (r) { >> - /* bad news, how to tell it to userspace ? */ >> - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", >> atomic_read(&adev->gpu_reset_counter)); >> - amdgpu_vf_error_put(tmp_adev, >> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); >> - } else { >> - dev_info(tmp_adev->dev, "GPU reset(%d) >> succeeded!\n", atomic_read(&adev->gpu_reset_counter)); >> + tmp_adev->asic_reset_res = 0; >> + >> + if (r) { >> + /* bad news, how to tell it to userspace ? 
*/ >> + dev_info(tmp_adev->dev, "GPU reset(%d) >> failed\n", atomic_read(&adev->gpu_reset_counter)); >> + amdgpu_vf_error_put(tmp_adev, >> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); >> + } else { >> + dev_info(tmp_adev->dev, "GPU reset(%d) >> succeeded!\n", atomic_read(&adev->gpu_reset_counter)); >> + } >> } >> >> - amdgpu_device_unlock_adev(tmp_adev); >> + amdgpu_device_unlock_adev(tmp_adev, in_ras_intr); >> } >> >> if (hive) >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index 151d7f2..757fd6d 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -40,6 +40,8 @@ >> >> #include "amdgpu_amdkfd.h" >> >> +#include "amdgpu_ras.h" >> + >> /* >> * KMS wrapper. >> * - 3.0.0 - initial driver >> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) >> struct drm_device *dev = pci_get_drvdata(pdev); >> struct amdgpu_device *adev = dev->dev_private; >> >> + if (amdgpu_ras_intr_triggered()) >> + return; >> + >> /* if we are running in a VM, make sure the device >> * torn down properly on reboot/shutdown. 
>> * unfortunately we can't detect certain diff --git >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> index da2143d..ced766c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device >> *dev, struct drm_file *file_priv) >> /* Ensure IB tests are run on ring */ >> flush_delayed_work(&adev->delayed_init_work); >> >> + >> + if (amdgpu_ras_intr_triggered()) { >> + DRM_ERROR("RAS Intr triggered, device disabled!!"); >> + return -EHWPOISON; >> + } >> + >> file_priv->driver_priv = NULL; >> >> r = pm_runtime_get_sync(dev->dev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> index 2d5897a..086e6df 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> @@ -24,6 +24,8 @@ >> #include <linux/debugfs.h> >> #include <linux/list.h> >> #include <linux/module.h> >> +#include <linux/reboot.h> >> +#include <linux/syscalls.h> >> #include "amdgpu.h" >> #include "amdgpu_ras.h" >> #include "amdgpu_atomfirmware.h" >> @@ -64,6 +66,9 @@ const char *ras_block_string[] = { >> /* inject address is 52 bits */ >> #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) >> >> + >> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); >> + >> static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, >> uint64_t offset, uint64_t size, >> struct amdgpu_bo **bo_ptr); >> @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, >> char __user *buf, >> ssize_t s; >> char val[128]; >> >> - if (amdgpu_ras_error_query(obj->adev, &info)) >> + if (amdgpu_ras_error_query(obj->adev, &info, false)) >> return -EINVAL; >> >> s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", @@ -188,6 >> +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, >> >> return 0; >> } >> + >> +static struct ras_manager 
*amdgpu_ras_find_obj(struct amdgpu_device >> *adev, >> + struct ras_common_if *head); >> + >> /** >> * DOC: AMDGPU RAS debugfs control interface >> * >> @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device >> *dev, >> .head = obj->head, >> }; >> >> - if (amdgpu_ras_error_query(obj->adev, &info)) >> + if (amdgpu_ras_error_query(obj->adev, &info, false)) >> return -EINVAL; >> >> return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", @@ -591,7 >> +600,7 @@ static int amdgpu_ras_enable_all_features(struct >> amdgpu_device *adev, >> >> /* query/inject/cure begin */ >> int amdgpu_ras_error_query(struct amdgpu_device *adev, >> - struct ras_query_if *info) >> + struct ras_query_if *info, bool print) >> { >> struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); >> struct ras_err_data err_data = {0, 0, 0, NULL}; @@ -627,12 +636,14 >> @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, >> info->ue_count = obj->err_data.ue_count; >> info->ce_count = obj->err_data.ce_count; >> >> - if (err_data.ce_count) >> + if (err_data.ce_count || print) { >> dev_info(adev->dev, "%ld correctable errors detected in %s >> block\n", >> obj->err_data.ce_count, ras_block_str(info- >>> head.block)); > [Tao] Could you explain why print is needed even when ce/ue_count == 0? And I think this code can be split into a separate patch. I will just remove it; at first we planned to also dump all CE/UE counters, but I didn't end up doing it. 
> >> - if (err_data.ue_count) >> + } >> + if (err_data.ue_count || print) { >> dev_info(adev->dev, "%ld uncorrectable errors detected >> in %s block\n", >> obj->err_data.ue_count, ras_block_str(info- >>> head.block)); >> + } >> >> return 0; >> } >> @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct >> amdgpu_device *adev, >> .head = obj->head, >> }; >> >> - if (amdgpu_ras_error_query(adev, &info)) >> + if (amdgpu_ras_error_query(adev, &info, true)) >> return -EINVAL; >> >> data.ce_count += info.ce_count; >> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) >> >> return 0; >> } >> + >> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { >> + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { >> + DRM_WARN("RAS event of type >> ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n"); >> + } >> +} >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> index 5a0df73..c0e22af 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct >> amdgpu_device *adev, >> struct ras_common_if *head); >> >> int amdgpu_ras_error_query(struct amdgpu_device *adev, >> - struct ras_query_if *info); >> + struct ras_query_if *info, bool print); >> >> int amdgpu_ras_error_inject(struct amdgpu_device *adev, >> struct ras_inject_if *info); >> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct >> amdgpu_device *adev, >> >> int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, >> struct ras_dispatch_if *info); >> + >> +extern atomic_t amdgpu_ras_in_intr; >> + >> +static inline bool amdgpu_ras_intr_triggered(void) { >> + return !!atomic_read(&amdgpu_ras_in_intr); >> +} >> + >> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); >> + >> #endif >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >> index 
b2c86a0..e7a83f6 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >> @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct >> amdgpu_device *adev, >> struct amdgpu_iv_entry *entry) >> { >> /* TODO ue will trigger an interrupt. */ >> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >> - if (adev->gfx.funcs->query_ras_error_count) >> - adev->gfx.funcs->query_ras_error_count(adev, err_data); >> - amdgpu_ras_reset_gpu(adev, 0); >> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { > [Tao] Have you encountered any error without the check? ras_data_cb would not be registered if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) We have a requirement not to handle block-specific interrupts when a sync flood is expected; the sync flood will trigger the err_event_athub interrupt, which is exactly the case when RAS GFX is enabled. > >> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >> + if (adev->gfx.funcs->query_ras_error_count) >> + adev->gfx.funcs->query_ras_error_count(adev, >> err_data); >> + amdgpu_ras_reset_gpu(adev, 0); >> + } >> return AMDGPU_RAS_SUCCESS; >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> index 43b4fbc..87a66c2 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct >> amdgpu_device *adev, >> struct ras_err_data *err_data, >> struct amdgpu_iv_entry *entry) >> { >> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >> - if (adev->umc.funcs->query_ras_error_count) >> - adev->umc.funcs->query_ras_error_count(adev, err_data); >> - /* umc query_ras_error_address is also responsible for clearing >> - * error status >> - */ >> - if (adev->umc.funcs->query_ras_error_address) >> - adev->umc.funcs->query_ras_error_address(adev, err_data); >> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { > [Tao] 
AMDGPU_RAS_BLOCK__UMC See above explanation > >> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >> + if (adev->umc.funcs->query_ras_error_count) >> + adev->umc.funcs->query_ras_error_count(adev, >> err_data); >> + /* umc query_ras_error_address is also responsible for >> clearing >> + * error status >> + */ >> + if (adev->umc.funcs->query_ras_error_address) >> + adev->umc.funcs->query_ras_error_address(adev, >> err_data); >> >> - /* only uncorrectable error needs gpu reset */ >> - if (err_data->ue_count) >> - amdgpu_ras_reset_gpu(adev, 0); >> + /* only uncorrectable error needs gpu reset */ >> + if (err_data->ue_count) >> + amdgpu_ras_reset_gpu(adev, 0); >> + } >> >> return AMDGPU_RAS_SUCCESS; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >> index 367f9d6..545990c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >> @@ -30,6 +30,7 @@ >> #include "nbio/nbio_7_4_0_smn.h" >> #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" >> #include <uapi/linux/kfd_ioctl.h> >> +#include "amdgpu_ras.h" >> >> #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c >> >> @@ -329,6 +330,8 @@ static void >> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device >> BIF_DOORBELL_INT_CNTL, >> >> RAS_CNTLR_INTERRUPT_CLEAR, 1); >> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, >> bif_doorbell_intr_cntl); >> + >> + amdgpu_ras_global_ras_isr(adev); >> } >> } >> >> @@ -344,6 +347,8 @@ static void >> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d >> BIF_DOORBELL_INT_CNTL, >> >> RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); >> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, >> bif_doorbell_intr_cntl); >> + >> + amdgpu_ras_global_ras_isr(adev); >> } >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >> index 956432f..438e504 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
>> @@ -1972,24 +1972,26 @@ static int >> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, >> uint32_t err_source; >> int instance; >> >> - instance = sdma_v4_0_irq_id_to_seq(entry->client_id); >> - if (instance < 0) >> - return 0; >> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { > [Tao] AMDGPU_RAS_BLOCK__SDMA See above explanation Andrey > >> + instance = sdma_v4_0_irq_id_to_seq(entry->client_id); >> + if (instance < 0) >> + return 0; >> >> - switch (entry->src_id) { >> - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: >> - err_source = 0; >> - break; >> - case SDMA0_4_0__SRCID__SDMA_ECC: >> - err_source = 1; >> - break; >> - default: >> - return 0; >> - } >> + switch (entry->src_id) { >> + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: >> + err_source = 0; >> + break; >> + case SDMA0_4_0__SRCID__SDMA_ECC: >> + err_source = 1; >> + break; >> + default: >> + return 0; >> + } >> >> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >> >> - amdgpu_ras_reset_gpu(adev, 0); >> + amdgpu_ras_reset_gpu(adev, 0); >> + } >> >> return AMDGPU_RAS_SUCCESS; >> } >> -- >> 2.7.4 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx