[AMD Official Use Only - General] The series looks fine to me; these patches also need to be reviewed by the virtualization group. Regards, Stanley > -----Original Message----- > From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Sent: Wednesday, August 31, 2022 4:39 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Liu, > Monk <Monk.Liu@xxxxxxx>; Skvortsov, Victor > <Victor.Skvortsov@xxxxxxx>; Chang, HaiJun <HaiJun.Chang@xxxxxxx>; > Chander, Vignesh <Vignesh.Chander@xxxxxxx>; Wan, Gavin > <Gavin.Wan@xxxxxxx>; Liu, Shaoyun <Shaoyun.Liu@xxxxxxx> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV > > In SRIOV, the RAS error injection request will be sent to the PF via mailbox, so the > injection input information should also be transferred to the PF. > > Generally, error injection is performed on the PF side directly, but for the RAS > poison test, since the workload is launched on the VF side, the VF has to tell the PF about the > injection information. 
> > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 26 ++++++++++++++++------ > -- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 24 > ++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 9 ++++++++ > 4 files changed, 53 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ab9ba5a9c33d..498642eb5fb7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1103,15 +1103,25 @@ int amdgpu_ras_error_inject(struct > amdgpu_device *adev, > block_info.address); > } > > - if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { > - if (block_obj->hw_ops->ras_error_inject) > - ret = block_obj->hw_ops->ras_error_inject(adev, > info); > + if (!amdgpu_sriov_vf(adev)) { > + if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { > + if (block_obj->hw_ops->ras_error_inject) > + ret = block_obj->hw_ops- > >ras_error_inject(adev, info); > + } else { > + /* If defined special ras_error_inject(e.g: xgmi), > implement special ras_error_inject */ > + if (block_obj->hw_ops->ras_error_inject) > + ret = block_obj->hw_ops- > >ras_error_inject(adev, &block_info); > + else /*If not defined .ras_error_inject, use default > ras_error_inject*/ > + ret = psp_ras_trigger_error(&adev->psp, > &block_info); > + } > } else { > - /* If defined special ras_error_inject(e.g: xgmi), implement > special ras_error_inject */ > - if (block_obj->hw_ops->ras_error_inject) > - ret = block_obj->hw_ops->ras_error_inject(adev, > &block_info); > - else /*If not defined .ras_error_inject, use default > ras_error_inject*/ > - ret = psp_ras_trigger_error(&adev->psp, > &block_info); > + if (adev->virt.ops && adev->virt.ops->ras_trigger_error) { > + adev->virt.ops->ras_trigger_error(adev, &block_info); > + ret = 0; > + } else { > + dev_warn(adev->dev, > + "No ras_trigger_error interface in 
SRIOV!\n"); > + } > } > > if (ret) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > index 239f232f9c02..4534e6f70a4b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > @@ -84,6 +84,8 @@ struct amdgpu_virt_ops { > int (*reset_gpu)(struct amdgpu_device *adev); > int (*wait_reset)(struct amdgpu_device *adev); > void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, > u32 data2, u32 data3); > + void (*ras_trigger_error)(struct amdgpu_device *adev, > + struct ta_ras_trigger_error_input *info); > }; > > /* > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index a2f04b249132..3b4c5162a237 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -33,6 +33,7 @@ > #include "mxgpu_ai.h" > > #include "amdgpu_reset.h" > +#include "ta_ras_if.h" > > static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev) { @@ - > 405,6 +406,28 @@ static int xgpu_ai_request_init_data(struct > amdgpu_device *adev) > return xgpu_ai_send_access_requests(adev, > IDH_REQ_GPU_INIT_DATA); } > > +void amdgpu_virt_ras_trigger_error(struct amdgpu_device *adev, > + struct ta_ras_trigger_error_input *info) { > + uint32_t addr_lo, addr_hi, data1 = 0; > + > + addr_lo = lower_32_bits(info->address); > + addr_hi = upper_32_bits(info->address); > + > + /* block id : bits[0:4], inject_error_type : bits[5:8] > + * sub_block_index : bits[9:17], value : bits[18:19] > + */ > + data1 = info->block_id & RAS_BLOCK_ID_MASK; > + data1 |= (info->inject_error_type & RAS_INJECT_ERROR_TYPE_MASK) > << > + RAS_INJECT_ERROR_TYPE_SHIFT; > + data1 |= (info->sub_block_index & RAS_SUB_BLOCK_INDEX_MASK) << > + RAS_SUB_BLOCK_INDEX_SHIFT; > + data1 |= (info->value & RAS_VALUE_MASK) << RAS_VALUE_SHIFT; > + > + xgpu_ai_mailbox_trans_msg(adev, IDH_RAS_ERROR_INJECT, data1, > + addr_lo, addr_hi); > +} > + > const struct 
amdgpu_virt_ops xgpu_ai_virt_ops = { > .req_full_gpu = xgpu_ai_request_full_gpu_access, > .rel_full_gpu = xgpu_ai_release_full_gpu_access, > @@ -412,4 +435,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = { > .wait_reset = NULL, > .trans_msg = xgpu_ai_mailbox_trans_msg, > .req_init_data = xgpu_ai_request_init_data, > + .ras_trigger_error = amdgpu_virt_ras_trigger_error, > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > index fa7e13e0459e..0841d6632328 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > @@ -29,6 +29,14 @@ > #define AI_MAILBOX_POLL_FLR_TIMEDOUT 10000 > #define AI_MAILBOX_POLL_MSG_REP_MAX 11 > > +#define RAS_BLOCK_ID_MASK 0x1f > +#define RAS_INJECT_ERROR_TYPE_MASK 0xf > +#define RAS_INJECT_ERROR_TYPE_SHIFT 5 > +#define RAS_SUB_BLOCK_INDEX_MASK 0x1ff > +#define RAS_SUB_BLOCK_INDEX_SHIFT 9 > +#define RAS_VALUE_MASK 0x3 > +#define RAS_VALUE_SHIFT 18 > + > enum idh_request { > IDH_REQ_GPU_INIT_ACCESS = 1, > IDH_REL_GPU_INIT_ACCESS, > @@ -39,6 +47,7 @@ enum idh_request { > > IDH_LOG_VF_ERROR = 200, > IDH_READY_TO_RESET = 201, > + IDH_RAS_ERROR_INJECT = 202, > }; > > enum idh_event { > -- > 2.35.1