From: YiPeng Chai <YiPeng.Chai@xxxxxxx> Support passing poison consumption ras blocks to SRIOV. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 ++- drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 3 ++- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 23 +++++++++++++++---- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 7 ++++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 7 ++++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 7 ++++-- 13 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 77e263660288..dfb93664e866 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev) amdgpu_device_flush_hdp(adev, NULL); } -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, bool reset) { - amdgpu_umc_poison_handler(adev, reset); + amdgpu_umc_poison_handler(adev, block, reset); } int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 584a0cea5572..50d3e0149032 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - bool reset); + enum amdgpu_ras_block block, bool reset); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ebcd1cb60052..79bf6bd428a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * } } - amdgpu_umc_poison_handler(adev, false); + amdgpu_umc_poison_handler(adev, obj->head.block, false); if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index a6cdb69897f2..20436f81856a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, return 0; } -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, bool reset) { int ret = AMDGPU_RAS_SUCCESS; @@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) } } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) - adev->virt.ops->ras_poison_handler(adev); + adev->virt.ops->ras_poison_handler(adev, block); else dev_warn(adev->dev, "No ras_poison_handler interface in SRIOV!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 83199296ed10..26d2ae498daf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -102,7 +102,8 @@ struct amdgpu_umc { int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset); +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, bool reset); int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index f4963330c772..f300d4a4457d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev, amdgpu_ras_interrupt_dispatch(adev, &ih_data); } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) - adev->virt.ops->ras_poison_handler(adev); + adev->virt.ops->ras_poison_handler(adev, ras_if->block); else dev_warn(adev->dev, "No ras_poison_handler interface in SRIOV for VCN!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 1b49c007ff62..fa7be5f277b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -88,7 +88,8 @@ struct amdgpu_virt_ops { int (*wait_reset)(struct amdgpu_device *adev); void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req, u32 data1, u32 data2, u32 data3); - void (*ras_poison_handler)(struct amdgpu_device *adev); + void (*ras_poison_handler)(struct amdgpu_device *adev, + enum amdgpu_ras_block block); }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c index 26d6286d86c9..9e7ce1e6bc06 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c @@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev, amdgpu_ras_interrupt_dispatch(adev, &ih_data); } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) - adev->virt.ops->ras_poison_handler(adev); + adev->virt.ops->ras_poison_handler(adev, ras_if->block); else dev_warn(adev->dev, "No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 63725b2ebc03..a2bd2c3b1ef9 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev) return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA); } -static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev) +static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block) { xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 6a68ee946f1c..d0a018da3c7a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev, xgpu_nv_mailbox_set_valid(adev, false); } -static int xgpu_nv_send_access_requests(struct amdgpu_device *adev, - enum idh_request req) +static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev, + enum idh_request req, u32 data1, u32 data2, u32 data3) { int r, retry = 1; enum idh_event event = -1; send_request: - xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0); + xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3); switch (req) { case IDH_REQ_GPU_INIT_ACCESS: @@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev, return 0; } +static int xgpu_nv_send_access_requests(struct amdgpu_device *adev, + enum idh_request req) +{ + return xgpu_nv_send_access_requests_with_param(adev, + req, 0, 0, 0); +} + static int xgpu_nv_request_reset(struct amdgpu_device *adev) { int ret, i = 0; @@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev) amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); } -static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev) +static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block) { - xgpu_nv_send_access_requests(adev, IDH_RAS_POISON); + if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) { + xgpu_nv_send_access_requests(adev, IDH_RAS_POISON); + } else { + xgpu_nv_send_access_requests_with_param(adev, + IDH_RAS_POISON, block, 0, 0); + } } const struct amdgpu_virt_ops xgpu_nv_virt_ops = { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index a7697ec8188e..9a06c6fb6605 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { + enum amdgpu_ras_block block = 0; int old_poison, ret = -EINVAL; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: ret = kfd_dqm_evict_pasid(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA2: case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: + block = AMDGPU_RAS_BLOCK__SDMA; break; default: break; @@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, dev_warn(dev->adev->dev, "RAS poison consumption, unmap queue flow succeeded: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); } else { dev_warn(dev->adev->dev, "RAS poison consumption, fall back to gpu reset flow: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 2a65792fd116..c6d28e37ed46 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, uint16_t pasid, uint16_t source_id) { + enum amdgpu_ras_block block = 0; int ret = -EINVAL; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, case SOC15_INTSRC_SQ_INTERRUPT_MSG: if (dev->dqm->ops.reset_queues) ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; break; case SOC21_INTSRC_SDMA_ECC: + block = AMDGPU_RAS_BLOCK__SDMA; default: break; } @@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, /* resetting queue passes, do page retirement without gpu reset resetting queue fails, fallback to gpu reset solution */ if (!ret) - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); else - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); } static bool event_interrupt_isr_v11(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 27cdaea40501..91dd5e045b51 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { + enum amdgpu_ras_block block = 0; int old_poison, ret = -EINVAL; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: ret = kfd_dqm_evict_pasid(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA2: case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: + block = AMDGPU_RAS_BLOCK__SDMA; break; default: break; @@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, dev_warn(dev->adev->dev, "RAS poison consumption, unmap queue flow succeeded: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); } else { dev_warn(dev->adev->dev, "RAS poison consumption, fall back to gpu reset flow: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); } } -- 2.25.1