On 8/8/2024 10:56 PM, Victor Skvortsov wrote: > VFs do not perform HW fini/suspend in FLR, so the dpm_enabled > is incorrectly kept enabled. Add interface to disable it in > virt_pre_reset call. > > Signed-off-by: Victor Skvortsov <victor.skvortsov@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 8 +++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + > .../gpu/drm/amd/include/kgd_pp_interface.h | 1 + > .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 21 +++++++++++++++++++ > 5 files changed, 37 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 730dae77570c..1be5699f4190 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5288,10 +5288,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > if (reset_context->reset_req_dev == adev) > job = reset_context->job; > > - if (amdgpu_sriov_vf(adev)) { > - /* stop the data exchange thread */ > - amdgpu_virt_fini_data_exchange(adev); > - } > + if (amdgpu_sriov_vf(adev)) > + amdgpu_virt_pre_reset(adev); > > amdgpu_fence_driver_isr_toggle(adev, true); > > @@ -5561,6 +5559,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, > > static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) > { > + if (amdgpu_sriov_vf(adev)) { > + adev->mp1_state = PP_MP1_STATE_FLR; > + return; > + } Better remove this change. If at all this state need to be persisted through out the reset, handle it only through amdgpu_dpm_set_mp1_state. For now, I don't see a reason to store this state, we only need this state as a trigger for the action associated with this. > > switch (amdgpu_asic_reset_method(adev)) { > case AMD_RESET_METHOD_MODE1: > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > index 111c380f929b..456a685c3975 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > @@ -33,6 +33,7 @@ > #include "amdgpu.h" > #include "amdgpu_ras.h" > #include "amdgpu_reset.h" > +#include "amdgpu_dpm.h" > #include "vi.h" > #include "soc15.h" > #include "nv.h" > @@ -849,6 +850,13 @@ enum amdgpu_sriov_vf_mode amdgpu_virt_get_sriov_vf_mode(struct amdgpu_device *ad > return mode; > } > > +void amdgpu_virt_pre_reset(struct amdgpu_device *adev) > +{ > + /* stop the data exchange thread */ > + amdgpu_virt_fini_data_exchange(adev); > + amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_FLR); > +} > + > void amdgpu_virt_post_reset(struct amdgpu_device *adev) > { > if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > index b42a8854dca0..b650a2032c42 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > @@ -376,6 +376,7 @@ u32 amdgpu_sriov_rreg(struct amdgpu_device *adev, > u32 offset, u32 acc_flags, u32 hwip, u32 xcc_id); > bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev, > uint32_t ucode_id); > +void amdgpu_virt_pre_reset(struct amdgpu_device *adev); > void amdgpu_virt_post_reset(struct amdgpu_device *adev); > bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev); > bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev, > diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h > index 4b20e2274313..19a48d98830a 100644 > --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h > @@ -218,6 +218,7 @@ enum pp_mp1_state { > PP_MP1_STATE_SHUTDOWN, > PP_MP1_STATE_UNLOAD, > PP_MP1_STATE_RESET, > + PP_MP1_STATE_FLR, > }; > > enum pp_df_cstate { > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > index 78c3f94bb3ff..b85478b1eaa7 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c > @@ -2638,6 +2638,26 @@ static int smu_v13_0_6_send_rma_reason(struct smu_context *smu) > return ret; > } > > +static int smu_v13_0_6_set_mp1_state(struct smu_context *smu, > + enum pp_mp1_state mp1_state) > +{ > + int ret =0; > + > + switch (mp1_state) { > + case PP_MP1_STATE_FLR: > + /* VF lost access to SMU */ > + smu->adev->pm.dpm_enabled = false; > + ret = 0; > + break; > + default: > + /* Ignore others */ > + ret = 0; > + } > + > + return ret; > +} Should this be made generic for all SOCs - i.e., handle it amdgpu_dpm_set_mp1_state() itself? The logic remains the same - turn off dpm flag during VF reset. Thanks, Lijo > + > + > static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) > { > struct smu_context *smu = adev->powerplay.pp_handle; > @@ -3283,6 +3303,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { > .i2c_fini = smu_v13_0_6_i2c_control_fini, > .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num, > .send_rma_reason = smu_v13_0_6_send_rma_reason, > + .set_mp1_state = smu_v13_0_6_set_mp1_state, > }; > > void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)