[AMD Official Use Only]
Ping...
From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Sent: Friday, September 24, 2021 4:37 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Joshi, Mukul <Mukul.Joshi@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma (v2) In ras poison mode, page retirement will be handled by the irq handler of the
module which consumes corrupted data. v2: rename ras_process_cb to ras_poison_consumption_handler. move the handler's implementation from ASIC specific file to common file. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 ++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 4 ++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1d41c2c00623..7bc4248a8d49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -31,6 +31,8 @@ #include <linux/dma-buf.h> #include "amdgpu_xgmi.h" #include <uapi/linux/kfd_ioctl.h> +#include "amdgpu_ras.h" +#include "amdgpu_umc.h" /* Total memory size in system memory and all GPU VRAM. Used to * estimate worst case amount of memory to reserve for page tables @@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) return adev->have_atomics_support; } + +int amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct ras_err_data err_data = {0, 0, 0, NULL}; + + /* CPU MCA will handle it if connected_to_cpu is 1 */ + if (!adev->gmc.xgmi.connected_to_cpu) + return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL); + else + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 3bc52b2c604f..d118e1dc273d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, uint64_t *mmap_offset); int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, struct tile_config *config); +int amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd); #if IS_ENABLED(CONFIG_HSA_AMD) void amdgpu_amdkfd_gpuvm_init_mem_limits(void); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 12d91e53556c..543e7ea75593 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_gpu_reset(dev->kgd); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd); return; } break; @@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); } else if (source_id == SOC15_INTSRC_SDMA_ECC) { kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_gpu_reset(dev->kgd); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd); return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || -- 2.17.1 |