[AMD Official Use Only - AMD Internal Distribution Only] The series is: Acked-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Sent: Friday, November 15, 2024 4:04 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Deucher, Alexander > <Alexander.Deucher@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 1/2] drm/amdgpu: Add init level for post reset reinit > > When device needs to be reset before initialization, it's not required for all IPs to be > initialized before a reset. In such cases, it needs to identify whether the IP/feature is > initialized for the first time or whether it's reinitialized after a reset. > > Add RESET_RECOVERY init level to identify post reset reinitialization phase. This > only provides a device level identification, IP/features may choose to track their state > independently also. > > Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/aldebaran.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 5 +++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 ++ > 7 files changed, 37 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c > b/drivers/gpu/drm/amd/amdgpu/aldebaran.c > index 6a2fd9e4f470..57c1ca055388 100644 > --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c > +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c > @@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, > } > > list_for_each_entry(tmp_adev, reset_device_list, reset_list) { > + amdgpu_set_init_level(tmp_adev, > + AMDGPU_INIT_LEVEL_RESET_RECOVERY); > dev_info(tmp_adev->dev, > "GPU reset succeeded, trying to resume\n"); > /*TBD: Ideally should clear only GFX, SDMA blocks*/ @@ -377,6 > +379,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control > *reset_ctl, > tmp_adev); > > if (!r) { > + amdgpu_set_init_level(tmp_adev, > + AMDGPU_INIT_LEVEL_DEFAULT); > amdgpu_irq_gpu_reset_resume_helper(tmp_adev); > > r = amdgpu_ib_ring_tests(tmp_adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 4f72ad4e843f..b8ef89d64704 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -846,6 +846,7 @@ struct amdgpu_mqd { > enum amdgpu_init_lvl_id { > AMDGPU_INIT_LEVEL_DEFAULT, > AMDGPU_INIT_LEVEL_MINIMAL_XGMI, > + AMDGPU_INIT_LEVEL_RESET_RECOVERY, > }; > > struct amdgpu_init_level { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 0419b37e75a8..415c469c2d80 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -155,6 +155,11 @@ struct amdgpu_init_level amdgpu_init_default = { > .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, }; > > +struct amdgpu_init_level amdgpu_init_recovery = { > + .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, > + .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, }; > + > /* > * Minimal blocks needed to be initialized before a XGMI hive can be reset. This > * is used for cases like reset on initialization where the entire hive needs to @@ - > 181,6 +186,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev, > case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: > adev->init_lvl = &amdgpu_init_minimal_xgmi; > break; > + case AMDGPU_INIT_LEVEL_RESET_RECOVERY: > + adev->init_lvl = &amdgpu_init_recovery; > + break; > case AMDGPU_INIT_LEVEL_DEFAULT: > fallthrough; > default: > @@ -5445,7 +5453,7 @@ int amdgpu_device_reinit_after_reset(struct > amdgpu_reset_context *reset_context) > struct list_head *device_list_handle; > bool full_reset, vram_lost = false; > struct amdgpu_device *tmp_adev; > - int r; > + int r, init_level; > > device_list_handle = reset_context->reset_device_list; > > @@ -5454,10 +5462,17 @@ int amdgpu_device_reinit_after_reset(struct > amdgpu_reset_context *reset_context) > > full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); > > + /** > + * If it's reset on init, it's default init level, otherwise keep level > + * as recovery level. > + */ > + if (reset_context->method == AMD_RESET_METHOD_ON_INIT) > + init_level = AMDGPU_INIT_LEVEL_DEFAULT; > + else > + init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; > r = 0; > list_for_each_entry(tmp_adev, device_list_handle, reset_list) { > - /* After reset, it's default init level */ > - amdgpu_set_init_level(tmp_adev, > AMDGPU_INIT_LEVEL_DEFAULT); > + amdgpu_set_init_level(tmp_adev, init_level); > if (full_reset) { > /* post card */ > amdgpu_ras_clear_err_state(tmp_adev); > @@ -5544,6 +5559,9 @@ int amdgpu_device_reinit_after_reset(struct > amdgpu_reset_context *reset_context) > > out: > if (!r) { > + /* IP init is complete now, set level as default */ > + amdgpu_set_init_level(tmp_adev, > + AMDGPU_INIT_LEVEL_DEFAULT); > amdgpu_irq_gpu_reset_resume_helper(tmp_adev); > r = amdgpu_ib_ring_tests(tmp_adev); > if (r) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index 4fc0ee01d56b..59a29fa12db3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -343,3 +343,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context > *rst_ctxt, char *buf, > strscpy(buf, "unknown", len); > } > } > + > +bool amdgpu_reset_in_recovery(struct amdgpu_device *adev) { > + return (adev->init_lvl->level == > AMDGPU_INIT_LEVEL_RESET_RECOVERY); > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index f8628bc898df..4d9b9701139b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler > xgmi_reset_on_init_handler; int amdgpu_reset_do_xgmi_reset_on_init( > struct amdgpu_reset_context *reset_context); > > +bool amdgpu_reset_in_recovery(struct amdgpu_device *adev); > + > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > index 9b01e074af47..2594467bdd87 100644 > --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > @@ -220,6 +220,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, > int r; > struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl- > >handle; > > + amdgpu_set_init_level(tmp_adev, > AMDGPU_INIT_LEVEL_RESET_RECOVERY); > dev_info(tmp_adev->dev, > "GPU reset succeeded, trying to resume\n"); > r = sienna_cichlid_mode2_restore_ip(tmp_adev); > @@ -237,6 +238,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, > > amdgpu_irq_gpu_reset_resume_helper(tmp_adev); > > + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); > r = amdgpu_ib_ring_tests(tmp_adev); > if (r) { > dev_err(tmp_adev->dev, > diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c > b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c > index e70ebad3f9fa..70569ea906bc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c > +++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c > @@ -221,6 +221,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, > int r; > struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl- > >handle; > > + amdgpu_set_init_level(tmp_adev, > AMDGPU_INIT_LEVEL_RESET_RECOVERY); > dev_info(tmp_adev->dev, > "GPU reset succeeded, trying to resume\n"); > r = smu_v13_0_10_mode2_restore_ip(tmp_adev); > @@ -234,6 +235,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct > amdgpu_reset_control *reset_ctl, > > amdgpu_irq_gpu_reset_resume_helper(tmp_adev); > > + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); > r = amdgpu_ib_ring_tests(tmp_adev); > if (r) { > dev_err(tmp_adev->dev, > -- > 2.25.1