Am 29.08.19 um 16:03 schrieb Grodzovsky, Andrey: > On 8/29/19 3:30 AM, Christian König wrote: >> Am 28.08.19 um 22:00 schrieb Andrey Grodzovsky: >>> Problem: >>> Under certain conditions, when some IP blocks take a RAS error, >>> we can get into a situation where a GPU reset is not possible >>> due to issues in RAS in SMU/PSP. >>> >>> Temporary fix until proper solution in PSP/SMU is ready: >>> When uncorrectable error happens the DF will unconditionally >>> broadcast error event packets to all its clients/slave upon >>> receiving fatal error event and freeze all its outbound queues, >>> err_event_athub interrupt will be triggered. >>> In such a case we use this interrupt >>> to issue GPU reset. The GPU reset code is modified for such case to >>> avoid HW >>> reset, only stops schedulers, detaches all in progress and not yet >>> scheduled >>> job's fences, set error code on them and signals. >>> Also reject any new incoming job submissions from user space. >>> All this is done to notify the applications of the problem. 
>>> >>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 >>> ++++++++++++++++++++++-------- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++++++-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++- >>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +-- >>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++---- >>> drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++ >>> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++----- >>> 10 files changed, 164 insertions(+), 62 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> index 9da681e..300adb8 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> @@ -38,6 +38,7 @@ >>> #include "amdgpu_gmc.h" >>> #include "amdgpu_gem.h" >>> #include "amdgpu_display.h" >>> +#include "amdgpu_ras.h" >>> #if defined(HAVE_DRM_FREE_LARGE) >>> #define kvfree drm_free_large >>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, >>> void *data, struct drm_file *filp) >>> bool reserved_buffers = false; >>> int i, r; >>> + if (amdgpu_ras_intr_triggered()) >>> + return -EHWPOISON; >>> + >>> if (!adev->accel_working) >>> return -EBUSY; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index 07a4ba0..3ecee10 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct >>> amdgpu_device *adev, bool trylock) >>> return true; >>> } >>> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) >>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, >>> bool skip_kfd) >>> { >>> /*unlock kfd: SRIOV 
would do it separately */ >>> - if (!amdgpu_sriov_vf(adev)) >>> + if (!amdgpu_sriov_vf(adev) && !skip_kfd) >>> amdgpu_amdkfd_post_reset(adev); >> It's most likely better to completely remove the call to >> amdgpu_amdkfd_post_reset() here. > > Felix advised that the way to stop all KFD activity is simply to NOT > call amdgpu_amdkfd_post_reset so that why I added this. Do you mean you > prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev ? Yes, exactly. It doesn't seems to be related to the unlock operation in the first place, but rather only signals the KFD that the reset is completed. Christian. > > >>> amdgpu_vf_error_trans_all(adev); >>> adev->mp1_state = PP_MP1_STATE_NONE; >>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct >>> amdgpu_device *adev) >>> } >>> +#define to_drm_sched_job(sched_job) \ >>> + container_of((sched_job), struct drm_sched_job, queue_node) >>> + >>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler >>> *sched) >>> +{ >>> + struct drm_sched_job *s_job; >>> + struct drm_sched_entity *s_entity = NULL; >>> + int i; >>> + >>> + /* Signal all jobs not yet scheduled */ >>> + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= >>> DRM_SCHED_PRIORITY_MIN; i--) { >>> + struct drm_sched_rq *rq = &sched->sched_rq[i]; >>> + >>> + if (!rq) >>> + continue; >>> + >>> + spin_lock(&rq->lock); >>> + list_for_each_entry(s_entity, &rq->entities, list) { >>> + while ((s_job = >>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { >>> + struct drm_sched_fence *s_fence = s_job->s_fence; >>> + >>> + dma_fence_signal(&s_fence->scheduled); >>> + dma_fence_set_error(&s_fence->finished, -EHWPOISON); >>> + dma_fence_signal(&s_fence->finished); >>> + } >>> + } >>> + spin_unlock(&rq->lock); >>> + } >>> + >>> + /* Signal all jobs already scheduled to HW */ >>> + list_for_each_entry(s_job, &sched->ring_mirror_list, node) { >>> + struct drm_sched_fence *s_fence = s_job->s_fence; >>> + >>> + 
dma_fence_set_error(&s_fence->finished, -EHWPOISON); >>> + dma_fence_signal(&s_fence->finished); >>> + } >>> +} >> That might be better put into amdgpu_job.c. >> >> And I assume this is called only during GPU reset will the scheduler >> fully stopped? > > Yes > > >>> + >>> /** >>> * amdgpu_device_gpu_recover - reset the asic and recover scheduler >>> * >>> @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct >>> amdgpu_device *adev, >>> struct amdgpu_hive_info *hive = NULL; >>> struct amdgpu_device *tmp_adev = NULL; >>> int i, r = 0; >>> + bool in_ras_intr = amdgpu_ras_intr_triggered(); >>> need_full_reset = job_signaled = false; >>> INIT_LIST_HEAD(&device_list); >>> - dev_info(adev->dev, "GPU reset begin!\n"); >>> + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs >>> stop":"reset"); >>> cancel_delayed_work_sync(&adev->delayed_init_work); >>> @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct >>> amdgpu_device *adev, >>> /* Build list of devices to reset */ >>> if (adev->gmc.xgmi.num_physical_nodes > 1) { >>> if (!hive) { >>> - amdgpu_device_unlock_adev(adev); >>> + amdgpu_device_unlock_adev(adev, false); >>> return -ENODEV; >>> } >>> @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct >>> amdgpu_device *adev, >>> /* block all schedulers and reset given job's ring */ >>> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { >>> /* disable ras on ALL IPs */ >>> - if (amdgpu_device_ip_need_full_reset(tmp_adev)) >>> + if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) >>> amdgpu_ras_suspend(tmp_adev); >>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >>> @@ -3834,10 +3873,16 @@ int amdgpu_device_gpu_recover(struct >>> amdgpu_device *adev, >>> continue; >>> drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); >>> + >>> + if (in_ras_intr) >>> + amdgpu_stop_all_jobs_on_sched(&ring->sched); >>> } >>> } >>> + if (in_ras_intr) >>> + goto skip_hw_reset; >>> + >>> /* >>> * Must check guilty signal here since after this point all old >>> * HW fences are force signaled. >>> @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct >>> amdgpu_device *adev, >>> /* Post ASIC reset for all devs .*/ >>> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { >>> - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >>> - struct amdgpu_ring *ring = tmp_adev->rings[i]; >>> - if (!ring || !ring->sched.thread) >>> - continue; >>> + if (!in_ras_intr) { >> Maybe write it like this: >> >> if (in_ras_intr) { >> amdgpu_device_unlock_adev(..) >> continue; >> } >> >> Or even better use a completely separate unlock loop. >> >> Christian. >> >>> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { >>> + struct amdgpu_ring *ring = tmp_adev->rings[i]; >>> - /* No point to resubmit jobs if we didn't HW reset*/ >>> - if (!tmp_adev->asic_reset_res && !job_signaled) >>> - drm_sched_resubmit_jobs(&ring->sched); >>> + if (!ring || !ring->sched.thread) >>> + continue; >>> - drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); >>> - } >>> + /* No point to resubmit jobs if we didn't HW reset*/ >>> + if (!tmp_adev->asic_reset_res && !job_signaled) >>> + drm_sched_resubmit_jobs(&ring->sched); >>> - if (!amdgpu_device_has_dc_support(tmp_adev) && >>> !job_signaled) { >>> - drm_helper_resume_force_mode(tmp_adev->ddev); >>> - } >>> + drm_sched_start(&ring->sched, >>> !tmp_adev->asic_reset_res); >>> + } >>> - tmp_adev->asic_reset_res = 0; >>> + if (!amdgpu_device_has_dc_support(tmp_adev) && >>> !job_signaled) { >>> + drm_helper_resume_force_mode(tmp_adev->ddev); >>> + } >>> - if (r) { >>> - /* bad news, how to tell it to userspace ? 
*/ >>> - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", >>> atomic_read(&adev->gpu_reset_counter)); >>> - amdgpu_vf_error_put(tmp_adev, >>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); >>> - } else { >>> - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", >>> atomic_read(&adev->gpu_reset_counter)); >>> + tmp_adev->asic_reset_res = 0; >>> + >>> + if (r) { >>> + /* bad news, how to tell it to userspace ? */ >>> + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", >>> atomic_read(&adev->gpu_reset_counter)); >>> + amdgpu_vf_error_put(tmp_adev, >>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); >>> + } else { >>> + dev_info(tmp_adev->dev, "GPU reset(%d) >>> succeeded!\n", atomic_read(&adev->gpu_reset_counter)); >>> + } >>> } >>> - amdgpu_device_unlock_adev(tmp_adev); >>> + amdgpu_device_unlock_adev(tmp_adev, in_ras_intr); >>> } >>> if (hive) >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>> index 151d7f2..757fd6d 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>> @@ -40,6 +40,8 @@ >>> #include "amdgpu_amdkfd.h" >>> +#include "amdgpu_ras.h" >>> + >>> /* >>> * KMS wrapper. >>> * - 3.0.0 - initial driver >>> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) >>> struct drm_device *dev = pci_get_drvdata(pdev); >>> struct amdgpu_device *adev = dev->dev_private; >>> + if (amdgpu_ras_intr_triggered()) >>> + return; >>> + >>> /* if we are running in a VM, make sure the device >>> * torn down properly on reboot/shutdown. 
>>> * unfortunately we can't detect certain >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> index da2143d..ced766c 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device >>> *dev, struct drm_file *file_priv) >>> /* Ensure IB tests are run on ring */ >>> flush_delayed_work(&adev->delayed_init_work); >>> + >>> + if (amdgpu_ras_intr_triggered()) { >>> + DRM_ERROR("RAS Intr triggered, device disabled!!"); >>> + return -EHWPOISON; >>> + } >>> + >>> file_priv->driver_priv = NULL; >>> r = pm_runtime_get_sync(dev->dev); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >>> index 2d5897a..086e6df 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >>> @@ -24,6 +24,8 @@ >>> #include <linux/debugfs.h> >>> #include <linux/list.h> >>> #include <linux/module.h> >>> +#include <linux/reboot.h> >>> +#include <linux/syscalls.h> >>> #include "amdgpu.h" >>> #include "amdgpu_ras.h" >>> #include "amdgpu_atomfirmware.h" >>> @@ -64,6 +66,9 @@ const char *ras_block_string[] = { >>> /* inject address is 52 bits */ >>> #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) >>> + >>> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); >>> + >>> static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, >>> uint64_t offset, uint64_t size, >>> struct amdgpu_bo **bo_ptr); >>> @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file >>> *f, char __user *buf, >>> ssize_t s; >>> char val[128]; >>> - if (amdgpu_ras_error_query(obj->adev, &info)) >>> + if (amdgpu_ras_error_query(obj->adev, &info, false)) >>> return -EINVAL; >>> s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", >>> @@ -188,6 +193,10 @@ static int >>> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, >>> return 0; >>> } >>> + >>> +static 
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device >>> *adev, >>> + struct ras_common_if *head); >>> + >>> /** >>> * DOC: AMDGPU RAS debugfs control interface >>> * >>> @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct >>> device *dev, >>> .head = obj->head, >>> }; >>> - if (amdgpu_ras_error_query(obj->adev, &info)) >>> + if (amdgpu_ras_error_query(obj->adev, &info, false)) >>> return -EINVAL; >>> return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", >>> @@ -591,7 +600,7 @@ static int amdgpu_ras_enable_all_features(struct >>> amdgpu_device *adev, >>> /* query/inject/cure begin */ >>> int amdgpu_ras_error_query(struct amdgpu_device *adev, >>> - struct ras_query_if *info) >>> + struct ras_query_if *info, bool print) >>> { >>> struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); >>> struct ras_err_data err_data = {0, 0, 0, NULL}; >>> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device >>> *adev, >>> info->ue_count = obj->err_data.ue_count; >>> info->ce_count = obj->err_data.ce_count; >>> - if (err_data.ce_count) >>> + if (err_data.ce_count || print) { >>> dev_info(adev->dev, "%ld correctable errors detected in %s >>> block\n", >>> obj->err_data.ce_count, ras_block_str(info->head.block)); >>> - if (err_data.ue_count) >>> + } >>> + if (err_data.ue_count || print) { >>> dev_info(adev->dev, "%ld uncorrectable errors detected in >>> %s block\n", >>> obj->err_data.ue_count, ras_block_str(info->head.block)); >>> + } >>> return 0; >>> } >>> @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct >>> amdgpu_device *adev, >>> .head = obj->head, >>> }; >>> - if (amdgpu_ras_error_query(adev, &info)) >>> + if (amdgpu_ras_error_query(adev, &info, true)) >>> return -EINVAL; >>> data.ce_count += info.ce_count; >>> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) >>> return 0; >>> } >>> + >>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) >>> +{ >>> + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 
0, 1) == 0) { >>> + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT >>> detected! Stopping all GPU jobs.\n"); >>> + } >>> +} >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >>> index 5a0df73..c0e22af 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >>> @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct >>> amdgpu_device *adev, >>> struct ras_common_if *head); >>> int amdgpu_ras_error_query(struct amdgpu_device *adev, >>> - struct ras_query_if *info); >>> + struct ras_query_if *info, bool print); >>> int amdgpu_ras_error_inject(struct amdgpu_device *adev, >>> struct ras_inject_if *info); >>> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct >>> amdgpu_device *adev, >>> int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, >>> struct ras_dispatch_if *info); >>> + >>> +extern atomic_t amdgpu_ras_in_intr; >>> + >>> +static inline bool amdgpu_ras_intr_triggered(void) >>> +{ >>> + return !!atomic_read(&amdgpu_ras_in_intr); >>> +} >>> + >>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); >>> + >>> #endif >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>> index b2c86a0..e7a83f6 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>> @@ -5669,10 +5669,12 @@ static int >>> gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, >>> struct amdgpu_iv_entry *entry) >>> { >>> /* TODO ue will trigger an interrupt. 
*/ >>> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >>> - if (adev->gfx.funcs->query_ras_error_count) >>> - adev->gfx.funcs->query_ras_error_count(adev, err_data); >>> - amdgpu_ras_reset_gpu(adev, 0); >>> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { >>> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >>> + if (adev->gfx.funcs->query_ras_error_count) >>> + adev->gfx.funcs->query_ras_error_count(adev, err_data); >>> + amdgpu_ras_reset_gpu(adev, 0); >>> + } >>> return AMDGPU_RAS_SUCCESS; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >>> index 43b4fbc..87a66c2 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >>> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct >>> amdgpu_device *adev, >>> struct ras_err_data *err_data, >>> struct amdgpu_iv_entry *entry) >>> { >>> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >>> - if (adev->umc.funcs->query_ras_error_count) >>> - adev->umc.funcs->query_ras_error_count(adev, err_data); >>> - /* umc query_ras_error_address is also responsible for clearing >>> - * error status >>> - */ >>> - if (adev->umc.funcs->query_ras_error_address) >>> - adev->umc.funcs->query_ras_error_address(adev, err_data); >>> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { >>> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >>> + if (adev->umc.funcs->query_ras_error_count) >>> + adev->umc.funcs->query_ras_error_count(adev, err_data); >>> + /* umc query_ras_error_address is also responsible for clearing >>> + * error status >>> + */ >>> + if (adev->umc.funcs->query_ras_error_address) >>> + adev->umc.funcs->query_ras_error_address(adev, err_data); >>> - /* only uncorrectable error needs gpu reset */ >>> - if (err_data->ue_count) >>> - amdgpu_ras_reset_gpu(adev, 0); >>> + /* only uncorrectable error needs gpu reset */ >>> + if (err_data->ue_count) >>> + amdgpu_ras_reset_gpu(adev, 0); >>> + } >>> return 
AMDGPU_RAS_SUCCESS; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >>> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >>> index 367f9d6..545990c 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c >>> @@ -30,6 +30,7 @@ >>> #include "nbio/nbio_7_4_0_smn.h" >>> #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" >>> #include <uapi/linux/kfd_ioctl.h> >>> +#include "amdgpu_ras.h" >>> #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c >>> @@ -329,6 +330,8 @@ static void >>> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device >>> BIF_DOORBELL_INT_CNTL, >>> RAS_CNTLR_INTERRUPT_CLEAR, 1); >>> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, >>> bif_doorbell_intr_cntl); >>> + >>> + amdgpu_ras_global_ras_isr(adev); >>> } >>> } >>> @@ -344,6 +347,8 @@ static void >>> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d >>> BIF_DOORBELL_INT_CNTL, >>> RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); >>> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, >>> bif_doorbell_intr_cntl); >>> + >>> + amdgpu_ras_global_ras_isr(adev); >>> } >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>> index 956432f..438e504 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>> @@ -1972,24 +1972,26 @@ static int >>> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, >>> uint32_t err_source; >>> int instance; >>> - instance = sdma_v4_0_irq_id_to_seq(entry->client_id); >>> - if (instance < 0) >>> - return 0; >>> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { >>> + instance = sdma_v4_0_irq_id_to_seq(entry->client_id); >>> + if (instance < 0) >>> + return 0; >>> - switch (entry->src_id) { >>> - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: >>> - err_source = 0; >>> - break; >>> - case SDMA0_4_0__SRCID__SDMA_ECC: >>> - err_source = 1; >>> - break; >>> - default: >>> - return 0; >>> - } >>> + switch (entry->src_id) 
{ >>> + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: >>> + err_source = 0; >>> + break; >>> + case SDMA0_4_0__SRCID__SDMA_ECC: >>> + err_source = 1; >>> + break; >>> + default: >>> + return 0; >>> + } >>> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >>> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); >>> - amdgpu_ras_reset_gpu(adev, 0); >>> + amdgpu_ras_reset_gpu(adev, 0); >>> + } >>> return AMDGPU_RAS_SUCCESS; >>> } _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx