On Mon, Feb 17, 2025 at 9:18 AM SRINIVASAN SHANMUGAM <srinivasan.shanmugam@xxxxxxx> wrote: > > > On 2/17/2025 7:44 PM, Alex Deucher wrote: > > On Sat, Feb 15, 2025 at 3:02 AM SRINIVASAN SHANMUGAM > > <srinivasan.shanmugam@xxxxxxx> wrote: > >> > >> On 2/14/2025 11:05 PM, Alex Deucher wrote: > >> > >> Re-send the mes message on resume to make sure the > >> mes state is up to date. > >> > >> Fixes: 8521e3c5f058 ("drm/amd/amdgpu: limit single process inside MES") > >> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> > >> Cc: Shaoyun Liu <shaoyun.liu@xxxxxxx> > >> Cc: Srinivasan Shanmugam <srinivasan.shanmugam@xxxxxxx> > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 ++++--------- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 +++++++++++++++++++- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 2 +- > >> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 4 ++++ > >> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 4 ++++ > >> 5 files changed, 32 insertions(+), 11 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > >> index b9bd6654f3172..a194bf3347cbc 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > >> @@ -1665,24 +1665,19 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, > >> } > >> > >> mutex_lock(&adev->enforce_isolation_mutex); > >> - > >> for (i = 0; i < num_partitions; i++) { > >> - if (adev->enforce_isolation[i] && !partition_values[i]) { > >> + if (adev->enforce_isolation[i] && !partition_values[i]) > >> /* Going from enabled to disabled */ > >> amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); > >> - if (adev->enable_mes && adev->gfx.enable_cleaner_shader) > >> - amdgpu_mes_set_enforce_isolation(adev, i, false); > >> - } else if (!adev->enforce_isolation[i] && partition_values[i]) { > >> + else if (!adev->enforce_isolation[i] && partition_values[i]) > >> /* Going from disabled to enabled */ > >> 
amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); > >> - if (adev->enable_mes && adev->gfx.enable_cleaner_shader) > >> - amdgpu_mes_set_enforce_isolation(adev, i, true); > >> - } > >> adev->enforce_isolation[i] = partition_values[i]; > >> } > >> - > >> mutex_unlock(&adev->enforce_isolation_mutex); > >> > >> + amdgpu_mes_update_enforce_isolation(adev); > >> + > >> return count; > >> } > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > >> index cee38bb6cfaf2..ca076306adba4 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > >> @@ -1508,7 +1508,8 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) > >> } > >> > >> /* Fix me -- node_id is used to identify the correct MES instances in the future */ > >> -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable) > >> +static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, > >> + uint32_t node_id, bool enable) > >> { > >> struct mes_misc_op_input op_input = {0}; > >> int r; > >> @@ -1530,6 +1531,23 @@ int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_i > >> return r; > >> } > >> > >> +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev) > >> +{ > >> + int i, r = 0; > >> + > >> + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) { > >> + mutex_lock(&adev->enforce_isolation_mutex); > >> + for (i = 0; i < (adev->xcp_mgr ? 
adev->xcp_mgr->num_xcps : 1); i++) { > >> + if (adev->enforce_isolation[i]) > >> + r |= amdgpu_mes_set_enforce_isolation(adev, i, true); > >> + else > >> + r |= amdgpu_mes_set_enforce_isolation(adev, i, false); > >> + } > >> + mutex_unlock(&adev->enforce_isolation_mutex); > >> + } > >> + return r; > >> +} > >> + > >> #if defined(CONFIG_DEBUG_FS) > >> > >> static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > >> index 6a792ffc81e33..3a65c3788956d 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > >> @@ -532,6 +532,6 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) > >> > >> bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); > >> > >> -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable); > >> +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev); > >> > >> #endif /* __AMDGPU_MES_H__ */ > >> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > >> index 530371e6a7aee..fc7b17463cb4d 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > >> @@ -1660,6 +1660,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block) > >> goto failure; > >> } > >> > >> + r = amdgpu_mes_update_enforce_isolation(adev); > >> + if (r) > >> + goto failure; > >> + > >> > >> Hi Alex, > >> > >> Should this also be moved to mes_v11_0_hw_init? Please let me know your thoughts. > > I'm not sure I follow. This is in hw_init. > > > > Alex > > Sorry, my mistake; I meant mes_v11_0_sw_init, please? There's no need to call it in sw_init, plus the hw is not set up in sw_init so you can't call it there anyway. The whole point of this is to update the firmware with the current sw state after a suspend or reset. Alex > > Thanks! 
> > Srini > > >> out: > >> /* > >> * Disable KIQ ring usage from the driver once MES is enabled. > >> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > >> index 6db88584dd529..ec91c78468f30 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > >> @@ -1773,6 +1773,10 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block) > >> goto failure; > >> } > >> > >> + r = amdgpu_mes_update_enforce_isolation(adev); > >> + if (r) > >> + goto failure; > >> + > >> > >> And similarly here also? > >> > >> Thanks! > >> > >> Srini > >> > >> out: > >> /* > >> * Disable KIQ ring usage from the driver once MES is enabled.