[AMD Official Use Only - General] Reviewed-by: Emily Deng <Emily.Deng@xxxxxxx> Emily Deng Best Wishes >-----Original Message----- >From: Li, Yunxiang (Teddy) <Yunxiang.Li@xxxxxxx> >Sent: Saturday, April 27, 2024 2:27 AM >To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx >Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian ><Christian.Koenig@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Kuehling, >Felix <Felix.Kuehling@xxxxxxx>; Deng, Emily <Emily.Deng@xxxxxxx>; Li, >Yunxiang (Teddy) <Yunxiang.Li@xxxxxxx> >Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR > >There are other reset sources that pass NULL as the job pointer, such as >amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the >FLR comes from the host does not work. > >Add a flag in reset_context to explicitly mark host triggered reset, and set >this flag when we receive host reset notification. > >Signed-off-by: Yunxiang Li <Yunxiang.Li@xxxxxxx> >--- >v2: fix typo >v3: pass reset_context directly >v4: clear the flag in case we retry > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++----- >drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 + > 5 files changed, 12 insertions(+), 5 deletions(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >index 8befd10bf007..33c889c027a5 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct >amdgpu_device *adev) > * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf > * > * @adev: amdgpu_device pointer >- * @from_hypervisor: request from hypervisor >+ * @reset_context: amdgpu reset context pointer > * > * do VF FLR and reinitialize Asic > * return 0 means succeeded otherwise failed > */ > static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, >- bool from_hypervisor) >+ struct amdgpu_reset_context >*reset_context) > { > int r; > struct amdgpu_hive_info *hive = NULL; >@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct >amdgpu_device *adev, > retry: > amdgpu_amdkfd_pre_reset(adev); > >- if (from_hypervisor) >+ if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { >+ clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); > r = amdgpu_virt_request_full_gpu(adev, true); >- else >+ } else { > r = amdgpu_virt_reset_gpu(adev); >+ } > if (r) > return r; >+ > amdgpu_ras_set_fed(adev, false); > amdgpu_irq_gpu_reset_resume_helper(adev); > >@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct >amdgpu_device *adev, > /* Actual ASIC resets if needed.*/ > /* Host driver will handle XGMI hive reset for SRIOV */ > if (amdgpu_sriov_vf(adev)) { >- r = amdgpu_device_reset_sriov(adev, job ? false : true); >+ r = amdgpu_device_reset_sriov(adev, reset_context); > if (r) > adev->asic_reset_res = r; > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >index b11d190ece53..5a9cc043b858 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS { > AMDGPU_NEED_FULL_RESET = 0, > AMDGPU_SKIP_HW_RESET = 1, > AMDGPU_SKIP_COREDUMP = 2, >+ AMDGPU_HOST_FLR = 3, > }; > > struct amdgpu_reset_context { >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >index c5ba9c4757a8..f4c47492e0cd 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct >work_struct *work) > reset_context.method = AMD_RESET_METHOD_NONE; > reset_context.reset_req_dev = adev; > clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); >+ set_bit(AMDGPU_HOST_FLR, &reset_context.flags); > > amdgpu_device_gpu_recover(adev, NULL, &reset_context); > } >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >index fa9d1b02f391..14cc7910e5cf 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct >work_struct *work) > reset_context.method = AMD_RESET_METHOD_NONE; > reset_context.reset_req_dev = adev; > clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); >+ set_bit(AMDGPU_HOST_FLR, &reset_context.flags); > > amdgpu_device_gpu_recover(adev, NULL, &reset_context); > } >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >index 14a065516ae4..78cd07744ebe 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct >work_struct *work) > reset_context.method = AMD_RESET_METHOD_NONE; > reset_context.reset_req_dev = adev; > clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); >+ set_bit(AMDGPU_HOST_FLR, &reset_context.flags); > > amdgpu_device_gpu_recover(adev, NULL, &reset_context); > } >-- >2.34.1