On 2019-08-28 4:00 p.m., Andrey Grodzovsky wrote: > Problem: > Under certain conditions, when some IP blocks take a RAS error, > we can get into a situation where a GPU reset is not possible > due to issues in RAS in SMU/PSP. > > Temporary fix until proper solution in PSP/SMU is ready: > When uncorrectable error happens the DF will unconditionally > broadcast error event packets to all its clients/slave upon > receiving fatal error event and freeze all its outbound queues, > err_event_athub interrupt will be triggered. > In such case we use this interrupt > to issue GPU reset. The GPU reset code is modified for such case to avoid HW > reset, only stops schedulers, detaches all in progress and not yet scheduled > job's fences, set error code on them and signals. > Also reject any new incoming job submissions from user space. > All this is done to notify the applications of the problem. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 ++++++++++++++++++++++-------- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++- > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +-- > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++---- > drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++ > drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++----- > 10 files changed, 164 insertions(+), 62 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index 9da681e..300adb8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -38,6 +38,7 @@ > #include "amdgpu_gmc.h" > #include "amdgpu_gem.h" > #include "amdgpu_display.h" > +#include "amdgpu_ras.h" > > #if defined(HAVE_DRM_FREE_LARGE) > #define kvfree drm_free_large > @@ 
-1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > bool reserved_buffers = false; > int i, r; > > + if (amdgpu_ras_intr_triggered()) > + return -EHWPOISON; > + > if (!adev->accel_working) > return -EBUSY; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 07a4ba0..3ecee10 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) > return true; > } > > -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) > +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool skip_kfd) > { > /*unlock kfd: SRIOV would do it separately */ > - if (!amdgpu_sriov_vf(adev)) > + if (!amdgpu_sriov_vf(adev) && !skip_kfd) > amdgpu_amdkfd_post_reset(adev); > amdgpu_vf_error_trans_all(adev); > adev->mp1_state = PP_MP1_STATE_NONE; > @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) > } > > > +#define to_drm_sched_job(sched_job) \ > + container_of((sched_job), struct drm_sched_job, queue_node) > + > +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) > +{ > + struct drm_sched_job *s_job; > + struct drm_sched_entity *s_entity = NULL; > + int i; > + > + /* Signal all jobs not yet scheduled */ > + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { > + struct drm_sched_rq *rq = &sched->sched_rq[i]; > + > + if (!rq) > + continue; > + > + spin_lock(&rq->lock); > + list_for_each_entry(s_entity, &rq->entities, list) { > + while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { > + struct drm_sched_fence *s_fence = s_job->s_fence; > + > + dma_fence_signal(&s_fence->scheduled); > + dma_fence_set_error(&s_fence->finished, -EHWPOISON); > + dma_fence_signal(&s_fence->finished); > + } > + } > + 
spin_unlock(&rq->lock); > + } > + > + /* Signal all jobs already scheduled to HW */ > + list_for_each_entry(s_job, &sched->ring_mirror_list, node) { > + struct drm_sched_fence *s_fence = s_job->s_fence; > + > + dma_fence_set_error(&s_fence->finished, -EHWPOISON); > + dma_fence_signal(&s_fence->finished); > + } > +} > + > /** > * amdgpu_device_gpu_recover - reset the asic and recover scheduler > * > @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > struct amdgpu_hive_info *hive = NULL; > struct amdgpu_device *tmp_adev = NULL; > int i, r = 0; > + bool in_ras_intr = amdgpu_ras_intr_triggered(); > > need_full_reset = job_signaled = false; > INIT_LIST_HEAD(&device_list); > > - dev_info(adev->dev, "GPU reset begin!\n"); > + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset"); > > cancel_delayed_work_sync(&adev->delayed_init_work); > > @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > /* Build list of devices to reset */ > if (adev->gmc.xgmi.num_physical_nodes > 1) { > if (!hive) { > - amdgpu_device_unlock_adev(adev); > + amdgpu_device_unlock_adev(adev, false); Is there a reason why you're not using in_ras_intr here to control KFD post-reset? Regards, Felix > return -ENODEV; > } > > @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > /* block all schedulers and reset given job's ring */ > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { > /* disable ras on ALL IPs */ > - if (amdgpu_device_ip_need_full_reset(tmp_adev)) > + if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) > amdgpu_ras_suspend(tmp_adev); > > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > @@ -3834,10 +3873,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > continue; > > drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); > + > + if (in_ras_intr) > + amdgpu_stop_all_jobs_on_sched(&ring->sched); > } > } > > > + if (in_ras_intr) > + goto skip_hw_reset; > + > /* > * Must check guilty signal here since after this point all old > * HW fences are force signaled. > @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > > /* Post ASIC reset for all devs .*/ > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { > - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > - struct amdgpu_ring *ring = tmp_adev->rings[i]; > > - if (!ring || !ring->sched.thread) > - continue; > + if (!in_ras_intr) { > + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > + struct amdgpu_ring *ring = tmp_adev->rings[i]; > > - /* No point to resubmit jobs if we didn't HW reset*/ > - if (!tmp_adev->asic_reset_res && !job_signaled) > - drm_sched_resubmit_jobs(&ring->sched); > + if (!ring || !ring->sched.thread) > + continue; > > - drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); > - } > + /* No point to resubmit jobs if we didn't HW reset*/ > + if (!tmp_adev->asic_reset_res && !job_signaled) > + drm_sched_resubmit_jobs(&ring->sched); > > - if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { > - drm_helper_resume_force_mode(tmp_adev->ddev); > - } > + drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); > + } > > - tmp_adev->asic_reset_res = 0; > + if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { > + drm_helper_resume_force_mode(tmp_adev->ddev); > + } > > - if (r) { > - /* bad news, how to tell it to userspace ? */ > - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); > - amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); > - } else { > - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); > + tmp_adev->asic_reset_res = 0; > + > + if (r) { > + /* bad news, how to tell it to userspace ? 
*/ > + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); > + amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); > + } else { > + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); > + } > } > > - amdgpu_device_unlock_adev(tmp_adev); > + amdgpu_device_unlock_adev(tmp_adev, in_ras_intr); > } > > if (hive) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 151d7f2..757fd6d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -40,6 +40,8 @@ > > #include "amdgpu_amdkfd.h" > > +#include "amdgpu_ras.h" > + > /* > * KMS wrapper. > * - 3.0.0 - initial driver > @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) > struct drm_device *dev = pci_get_drvdata(pdev); > struct amdgpu_device *adev = dev->dev_private; > > + if (amdgpu_ras_intr_triggered()) > + return; > + > /* if we are running in a VM, make sure the device > * torn down properly on reboot/shutdown. 
> * unfortunately we can't detect certain > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index da2143d..ced766c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) > /* Ensure IB tests are run on ring */ > flush_delayed_work(&adev->delayed_init_work); > > + > + if (amdgpu_ras_intr_triggered()) { > + DRM_ERROR("RAS Intr triggered, device disabled!!"); > + return -EHWPOISON; > + } > + > file_priv->driver_priv = NULL; > > r = pm_runtime_get_sync(dev->dev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2d5897a..086e6df 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -24,6 +24,8 @@ > #include <linux/debugfs.h> > #include <linux/list.h> > #include <linux/module.h> > +#include <linux/reboot.h> > +#include <linux/syscalls.h> > #include "amdgpu.h" > #include "amdgpu_ras.h" > #include "amdgpu_atomfirmware.h" > @@ -64,6 +66,9 @@ const char *ras_block_string[] = { > /* inject address is 52 bits */ > #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) > > + > +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); > + > static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, > uint64_t offset, uint64_t size, > struct amdgpu_bo **bo_ptr); > @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, > ssize_t s; > char val[128]; > > - if (amdgpu_ras_error_query(obj->adev, &info)) > + if (amdgpu_ras_error_query(obj->adev, &info, false)) > return -EINVAL; > > s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", > @@ -188,6 +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, > > return 0; > } > + > +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, > + struct ras_common_if *head); > + > /** 
> * DOC: AMDGPU RAS debugfs control interface > * > @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, > .head = obj->head, > }; > > - if (amdgpu_ras_error_query(obj->adev, &info)) > + if (amdgpu_ras_error_query(obj->adev, &info, false)) > return -EINVAL; > > return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", > @@ -591,7 +600,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, > > /* query/inject/cure begin */ > int amdgpu_ras_error_query(struct amdgpu_device *adev, > - struct ras_query_if *info) > + struct ras_query_if *info, bool print) > { > struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); > struct ras_err_data err_data = {0, 0, 0, NULL}; > @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, > info->ue_count = obj->err_data.ue_count; > info->ce_count = obj->err_data.ce_count; > > - if (err_data.ce_count) > + if (err_data.ce_count || print) { > dev_info(adev->dev, "%ld correctable errors detected in %s block\n", > obj->err_data.ce_count, ras_block_str(info->head.block)); > - if (err_data.ue_count) > + } > + if (err_data.ue_count || print) { > dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", > obj->err_data.ue_count, ras_block_str(info->head.block)); > + } > > return 0; > } > @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, > .head = obj->head, > }; > > - if (amdgpu_ras_error_query(adev, &info)) > + if (amdgpu_ras_error_query(adev, &info, true)) > return -EINVAL; > > data.ce_count += info.ce_count; > @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) > > return 0; > } > + > +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) > +{ > + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { > + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n"); > + } > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 5a0df73..c0e22af 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, > struct ras_common_if *head); > > int amdgpu_ras_error_query(struct amdgpu_device *adev, > - struct ras_query_if *info); > + struct ras_query_if *info, bool print); > > int amdgpu_ras_error_inject(struct amdgpu_device *adev, > struct ras_inject_if *info); > @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, > > int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, > struct ras_dispatch_if *info); > + > +extern atomic_t amdgpu_ras_in_intr; > + > +static inline bool amdgpu_ras_intr_triggered(void) > +{ > + return !!atomic_read(&amdgpu_ras_in_intr); > +} > + > +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); > + > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index b2c86a0..e7a83f6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, > struct amdgpu_iv_entry *entry) > { > /* TODO ue will trigger an interrupt. 
*/ > - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > - if (adev->gfx.funcs->query_ras_error_count) > - adev->gfx.funcs->query_ras_error_count(adev, err_data); > - amdgpu_ras_reset_gpu(adev, 0); > + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { > + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > + if (adev->gfx.funcs->query_ras_error_count) > + adev->gfx.funcs->query_ras_error_count(adev, err_data); > + amdgpu_ras_reset_gpu(adev, 0); > + } > return AMDGPU_RAS_SUCCESS; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 43b4fbc..87a66c2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, > struct ras_err_data *err_data, > struct amdgpu_iv_entry *entry) > { > - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > - if (adev->umc.funcs->query_ras_error_count) > - adev->umc.funcs->query_ras_error_count(adev, err_data); > - /* umc query_ras_error_address is also responsible for clearing > - * error status > - */ > - if (adev->umc.funcs->query_ras_error_address) > - adev->umc.funcs->query_ras_error_address(adev, err_data); > + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { > + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > + if (adev->umc.funcs->query_ras_error_count) > + adev->umc.funcs->query_ras_error_count(adev, err_data); > + /* umc query_ras_error_address is also responsible for clearing > + * error status > + */ > + if (adev->umc.funcs->query_ras_error_address) > + adev->umc.funcs->query_ras_error_address(adev, err_data); > > - /* only uncorrectable error needs gpu reset */ > - if (err_data->ue_count) > - amdgpu_ras_reset_gpu(adev, 0); > + /* only uncorrectable error needs gpu reset */ > + if (err_data->ue_count) > + amdgpu_ras_reset_gpu(adev, 0); > + } > > return AMDGPU_RAS_SUCCESS; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c > index 367f9d6..545990c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c > +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c > @@ -30,6 +30,7 @@ > #include "nbio/nbio_7_4_0_smn.h" > #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > #include <uapi/linux/kfd_ioctl.h> > +#include "amdgpu_ras.h" > > #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c > > @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device > BIF_DOORBELL_INT_CNTL, > RAS_CNTLR_INTERRUPT_CLEAR, 1); > WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); > + > + amdgpu_ras_global_ras_isr(adev); > } > } > > @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d > BIF_DOORBELL_INT_CNTL, > RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); > WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); > + > + amdgpu_ras_global_ras_isr(adev); > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > index 956432f..438e504 100644 > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > @@ -1972,24 +1972,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, > uint32_t err_source; > int instance; > > - instance = sdma_v4_0_irq_id_to_seq(entry->client_id); > - if (instance < 0) > - return 0; > + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { > + instance = sdma_v4_0_irq_id_to_seq(entry->client_id); > + if (instance < 0) > + return 0; > > - switch (entry->src_id) { > - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: > - err_source = 0; > - break; > - case SDMA0_4_0__SRCID__SDMA_ECC: > - err_source = 1; > - break; > - default: > - return 0; > - } > + switch (entry->src_id) { > + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: > + err_source = 0; > + break; > + case SDMA0_4_0__SRCID__SDMA_ECC: > + err_source = 1; > + break; > + default: > + return 0; > + } > > - 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > > - amdgpu_ras_reset_gpu(adev, 0); > + amdgpu_ras_reset_gpu(adev, 0); > + } > > return AMDGPU_RAS_SUCCESS; > } _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx