[AMD Official Use Only - General] Thanks for catching that. Thanks, Victor -----Original Message----- From: Quan, Evan <Evan.Quan@xxxxxxx> Sent: Friday, July 29, 2022 2:06 PM To: Zhao, Victor <Victor.Zhao@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Deng, Emily <Emily.Deng@xxxxxxx>; Zhao, Victor <Victor.Zhao@xxxxxxx>; Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx> Subject: RE: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid [AMD Official Use Only - General] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of > Victor Zhao > Sent: Thursday, July 28, 2022 6:30 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deng, Emily <Emily.Deng@xxxxxxx>; Zhao, Victor > <Victor.Zhao@xxxxxxx>; Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx> > Subject: [PATCH v2 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid > > To meet the requirements of the multi-container use case, which needs a > quicker reset that does not cause VRAM loss, add a Mode2 reset > handler for sienna_cichlid. 
> > v2: move skip mode2 flag part separately > > Signed-off-by: Victor Zhao <Victor.Zhao@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 7 + > drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 297 > ++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h | 32 ++ > .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h | 4 +- > drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 +- > .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 54 ++++ > 7 files changed, 395 insertions(+), 4 deletions(-) create mode > 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h > > diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile > b/drivers/gpu/drm/amd/amdgpu/Makefile > index c7d0cd15b5ef..7030ac2d7d2c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/Makefile > +++ b/drivers/gpu/drm/amd/amdgpu/Makefile > @@ -75,7 +75,7 @@ amdgpu-y += \ > vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o > vega10_reg_init.o \ > vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o > mxgpu_nv.o \ > nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o > soc21.o \ > - nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o > + sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o > lsdma_v6_0.o > > # add DF block > amdgpu-y += \ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index 32c86a0b145c..f778466bb9db 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -23,6 +23,7 @@ > > #include "amdgpu_reset.h" > #include "aldebaran.h" > +#include "sienna_cichlid.h" > > int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl, > struct amdgpu_reset_handler *handler) @@ -40,6 +41,9 @@ int > amdgpu_reset_init(struct amdgpu_device *adev) > case IP_VERSION(13, 0, 2): > ret = aldebaran_reset_init(adev); > break; > + case IP_VERSION(11, 0, 
7): > + ret = sienna_cichlid_reset_init(adev); > + break; > default: > break; > } > @@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev) > case IP_VERSION(13, 0, 2): > ret = aldebaran_reset_fini(adev); > break; > + case IP_VERSION(11, 0, 7): > + ret = sienna_cichlid_reset_fini(adev); > + break; > default: > break; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > new file mode 100644 > index 000000000000..0512960bed23 > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c > @@ -0,0 +1,297 @@ > +/* > + * Copyright 2021 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person > +obtaining a > + * copy of this software and associated documentation files (the > "Software"), > + * to deal in the Software without restriction, including without > + limitation > + * the rights to use, copy, modify, merge, publish, distribute, > + sublicense, > + * and/or sell copies of the Software, and to permit persons to whom > + the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be > + included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO > EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, > DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR > OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR > THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. 
> + * > + */ > + > +#include "sienna_cichlid.h" > +#include "amdgpu_reset.h" > +#include "amdgpu_amdkfd.h" > +#include "amdgpu_dpm.h" > +#include "amdgpu_job.h" > +#include "amdgpu_ring.h" > +#include "amdgpu_ras.h" > +#include "amdgpu_psp.h" > +#include "amdgpu_xgmi.h" > + > +static struct amdgpu_reset_handler * > +sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl, > + struct amdgpu_reset_context *reset_context) { > + struct amdgpu_reset_handler *handler; > + struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl- > >handle; > + > + if (reset_context->method != AMD_RESET_METHOD_NONE) { > + list_for_each_entry(handler, &reset_ctl->reset_handlers, > + handler_list) { > + if (handler->reset_method == reset_context- > >method) > + return handler; > + } > + } else { > + list_for_each_entry(handler, &reset_ctl->reset_handlers, > + handler_list) { > + if (handler->reset_method == > AMD_RESET_METHOD_MODE2 && > + adev->pm.fw_version >= 0x3a5500 && > + !amdgpu_sriov_vf(adev)) { > + reset_context->method = > AMD_RESET_METHOD_MODE2; > + return handler; > + } > + } > + } > + > + return NULL; > +} > + > +static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device > +*adev) { > + int r, i; > + > + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); > + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); > + > + for (i = adev->num_ip_blocks - 1; i >= 0; i--) { > + if (!(adev->ip_blocks[i].version->type == > + AMD_IP_BLOCK_TYPE_GFX || > + adev->ip_blocks[i].version->type == > + AMD_IP_BLOCK_TYPE_SDMA)) > + continue; > + > + r = adev->ip_blocks[i].version->funcs->suspend(adev); > + > + if (r) { > + dev_err(adev->dev, > + "suspend of IP block <%s> failed %d\n", > + adev->ip_blocks[i].version->funcs->name, r); > + return r; > + } > + adev->ip_blocks[i].status.hw = false; > + } > + > + return r; > +} > + > +static int > +sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control > *reset_ctl, > + struct amdgpu_reset_context > 
*reset_context) > +{ > + int r = 0; > + struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl- > >handle; > + > + if (!amdgpu_sriov_vf(adev)) > + r = sienna_cichlid_mode2_suspend_ip(adev); > + > + return r; > +} > + > +static void sienna_cichlid_async_reset(struct work_struct *work) { > + struct amdgpu_reset_handler *handler; > + struct amdgpu_reset_control *reset_ctl = > + container_of(work, struct amdgpu_reset_control, > reset_work); > + struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl- > >handle; > + > + list_for_each_entry(handler, &reset_ctl->reset_handlers, > + handler_list) { > + if (handler->reset_method == reset_ctl->active_reset) { > + dev_dbg(adev->dev, "Resetting device\n"); > + handler->do_reset(adev); > + break; > + } > + } > +} > + > +static int sienna_cichlid_mode2_reset(struct amdgpu_device *adev) { > + /* disable BM */ > + pci_clear_master(adev->pdev); > + adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev); > + return adev->asic_reset_res; > +} > + > +static int > +sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control > *reset_ctl, > + struct amdgpu_reset_context *reset_context) { > + struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl- > >handle; > + int r; > + > + r = sienna_cichlid_mode2_reset(adev); > + if (r) { > + dev_err(adev->dev, > + "ASIC reset failed with error, %d ", r); > + } > + return r; > +} > + > +static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device > +*adev) { > + int i, r; > + struct psp_context *psp = &adev->psp; > + > + r = psp_rlc_autoload_start(psp); > + if (r) { > + dev_err(adev->dev, "Failed to start rlc autoload\n"); > + return r; > + } > + > + /* Reinit GFXHUB */ > + adev->gfxhub.funcs->init(adev); > + r = adev->gfxhub.funcs->gart_enable(adev); > + if (r) { > + dev_err(adev->dev, "GFXHUB gart reenable failed after > reset\n"); > + return r; > + } > + > + for (i = 0; i < adev->num_ip_blocks; i++) { > + if (adev->ip_blocks[i].version->type == > AMD_IP_BLOCK_TYPE_IH) > 
+ r = adev->ip_blocks[i].version->funcs- > >resume(adev); > + if (r) { > + dev_err(adev->dev, > + "resume of IP block <%s> failed %d\n", > + adev->ip_blocks[i].version->funcs->name, r); > + return r; > + } > + > + adev->ip_blocks[i].status.hw = true; [Quan, Evan] It seems the above assignment will set status.hw to true for all IPs, although the resume is actually performed only on IH. That seems to be an issue. BR Evan > + } > + > + for (i = 0; i < adev->num_ip_blocks; i++) { > + if (!(adev->ip_blocks[i].version->type == > + AMD_IP_BLOCK_TYPE_GFX || > + adev->ip_blocks[i].version->type == > + AMD_IP_BLOCK_TYPE_SDMA)) > + continue; > + r = adev->ip_blocks[i].version->funcs->resume(adev); > + if (r) { > + dev_err(adev->dev, > + "resume of IP block <%s> failed %d\n", > + adev->ip_blocks[i].version->funcs->name, r); > + return r; > + } > + > + adev->ip_blocks[i].status.hw = true; > + } > + > + for (i = 0; i < adev->num_ip_blocks; i++) { > + if (!(adev->ip_blocks[i].version->type == > + AMD_IP_BLOCK_TYPE_GFX || > + adev->ip_blocks[i].version->type == > + AMD_IP_BLOCK_TYPE_SDMA)) > + continue; > + > + if (adev->ip_blocks[i].version->funcs->late_init) { > + r = adev->ip_blocks[i].version->funcs->late_init( > + (void *)adev); > + if (r) { > + dev_err(adev->dev, > + "late_init of IP block <%s> failed %d > after reset\n", > + adev->ip_blocks[i].version->funcs- > >name, > + r); > + return r; > + } > + } > + adev->ip_blocks[i].status.late_initialized = true; > + } > + > + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); > + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); > + > + return r; > +} > + > +static int > +sienna_cichlid_mode2_restore_hwcontext(struct amdgpu_reset_control > *reset_ctl, > + struct amdgpu_reset_context > *reset_context) > +{ > + int r; > + struct amdgpu_device *tmp_adev = (struct amdgpu_device > *)reset_ctl->handle; > + > + dev_info(tmp_adev->dev, > + "GPU reset succeeded, trying to resume\n"); > + r = sienna_cichlid_mode2_restore_ip(tmp_adev); > + if 
(r) > + goto end; > + > + /* > + * Add this ASIC as tracked as reset was already > + * complete successfully. > + */ > + amdgpu_register_gpu_instance(tmp_adev); > + > + /* Resume RAS */ > + amdgpu_ras_resume(tmp_adev); > + > + amdgpu_irq_gpu_reset_resume_helper(tmp_adev); > + > + r = amdgpu_ib_ring_tests(tmp_adev); > + if (r) { > + dev_err(tmp_adev->dev, > + "ib ring test failed (%d).\n", r); > + r = -EAGAIN; > + tmp_adev->asic_reset_res = r; > + goto end; > + } > + > +end: > + if (r) > + return -EAGAIN; > + else > + return r; > +} > + > +static struct amdgpu_reset_handler sienna_cichlid_mode2_handler = { > + .reset_method = AMD_RESET_METHOD_MODE2, > + .prepare_env = NULL, > + .prepare_hwcontext = sienna_cichlid_mode2_prepare_hwcontext, > + .perform_reset = > sienna_cichlid_mode2_perform_reset, > + .restore_hwcontext = sienna_cichlid_mode2_restore_hwcontext, > + .restore_env = NULL, > + .do_reset = sienna_cichlid_mode2_reset, > +}; > + > +int sienna_cichlid_reset_init(struct amdgpu_device *adev) { > + struct amdgpu_reset_control *reset_ctl; > + > + reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL); > + if (!reset_ctl) > + return -ENOMEM; > + > + reset_ctl->handle = adev; > + reset_ctl->async_reset = sienna_cichlid_async_reset; > + reset_ctl->active_reset = AMD_RESET_METHOD_NONE; > + reset_ctl->get_reset_handler = sienna_cichlid_get_reset_handler; > + > + INIT_LIST_HEAD(&reset_ctl->reset_handlers); > + INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset); > + /* Only mode2 is handled through reset control now */ > + amdgpu_reset_add_handler(reset_ctl, > &sienna_cichlid_mode2_handler); > + > + adev->reset_cntl = reset_ctl; > + > + return 0; > +} > + > +int sienna_cichlid_reset_fini(struct amdgpu_device *adev) { > + kfree(adev->reset_cntl); > + adev->reset_cntl = NULL; > + return 0; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h > b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h > new file mode 100644 > index 000000000000..5213b162dacd > --- 
/dev/null > +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h > @@ -0,0 +1,32 @@ > +/* > + * Copyright 2021 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person > +obtaining a > + * copy of this software and associated documentation files (the > "Software"), > + * to deal in the Software without restriction, including without > + limitation > + * the rights to use, copy, modify, merge, publish, distribute, > + sublicense, > + * and/or sell copies of the Software, and to permit persons to whom > + the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be > + included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO > EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, > DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR > OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR > THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. 
> + * > + */ > + > +#ifndef __SIENNA_CICHLID_H__ > +#define __SIENNA_CICHLID_H__ > + > +#include "amdgpu.h" > + > +int sienna_cichlid_reset_init(struct amdgpu_device *adev); int > +sienna_cichlid_reset_fini(struct amdgpu_device *adev); > + > +#endif > diff --git > a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h > b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h > index d2e10a724560..82cf9e563065 100644 > --- > a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h > +++ > b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h > @@ -137,7 +137,7 @@ > #define PPSMC_MSG_DisallowGpo 0x56 > > #define PPSMC_MSG_Enable2ndUSB20Port 0x57 > - > -#define PPSMC_Message_Count 0x58 > +#define PPSMC_MSG_DriverMode2Reset 0x5D > +#define PPSMC_Message_Count 0x5E > > #endif > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > index 19084a4fcb2b..28f6a1eb6945 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h > @@ -235,7 +235,8 @@ > __SMU_DUMMY_MAP(UnforceGfxVid), \ > __SMU_DUMMY_MAP(HeavySBR), \ > __SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel), \ > - __SMU_DUMMY_MAP(EnableGfxImu), > + __SMU_DUMMY_MAP(EnableGfxImu), \ > + __SMU_DUMMY_MAP(DriverMode2Reset), > > #undef __SMU_DUMMY_MAP > #define __SMU_DUMMY_MAP(type) SMU_MSG_##type > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c > index fa520d79ef67..a73d241bb64f 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c > @@ -154,6 +154,7 @@ static struct cmn2asic_msg_mapping > sienna_cichlid_message_map[SMU_MSG_MAX_COUNT] > MSG_MAP(SetGpoFeaturePMask, > PPSMC_MSG_SetGpoFeaturePMask, 0), > MSG_MAP(DisallowGpo, > PPSMC_MSG_DisallowGpo, 0), > MSG_MAP(Enable2ndUSB20Port, > PPSMC_MSG_Enable2ndUSB20Port, 0), > + 
MSG_MAP(DriverMode2Reset, > PPSMC_MSG_DriverMode2Reset, 0), > }; > > static struct cmn2asic_mapping sienna_cichlid_clk_map[SMU_CLK_COUNT] > = { > @@ -4254,6 +4255,57 @@ static int > sienna_cichlid_stb_get_data_direct(struct smu_context *smu, > return 0; > } > > +static bool sienna_cichlid_is_mode2_reset_supported(struct > +smu_context > *smu) > +{ > + return true; > +} > + > +static int sienna_cichlid_mode2_reset(struct smu_context *smu) { > + u32 smu_version; > + int ret = 0, index; > + struct amdgpu_device *adev = smu->adev; > + int timeout = 100; > + > + smu_cmn_get_smc_version(smu, NULL, &smu_version); > + > + index = smu_cmn_to_asic_specific_index(smu, > CMN2ASIC_MAPPING_MSG, > + > SMU_MSG_DriverMode2Reset); > + > + mutex_lock(&smu->message_lock); > + > + ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, > + SMU_RESET_MODE_2); > + > + ret = smu_cmn_wait_for_response(smu); > + while (ret != 0 && timeout) { > + ret = smu_cmn_wait_for_response(smu); > + /* Wait a bit more time for getting ACK */ > + if (ret != 0) { > + --timeout; > + usleep_range(500, 1000); > + continue; > + } else { > + break; > + } > + } > + > + if (!timeout) { > + dev_err(adev->dev, > + "failed to send mode2 message \tparam: 0x%08x > response %#x\n", > + SMU_RESET_MODE_2, ret); > + goto out; > + } > + > + dev_info(smu->adev->dev, "restore config space...\n"); > + /* Restore the config space saved during init */ > + amdgpu_device_load_pci_state(adev->pdev); > +out: > + mutex_unlock(&smu->message_lock); > + > + return ret; > +} > + > static const struct pptable_funcs sienna_cichlid_ppt_funcs = { > .get_allowed_feature_mask = > sienna_cichlid_get_allowed_feature_mask, > .set_default_dpm_table = sienna_cichlid_set_default_dpm_table, > @@ -4348,6 +4400,8 @@ static const struct pptable_funcs > sienna_cichlid_ppt_funcs = { > .get_default_config_table_settings = > sienna_cichlid_get_default_config_table_settings, > .set_config_table = sienna_cichlid_set_config_table, > .get_unique_id = 
sienna_cichlid_get_unique_id, > + .mode2_reset_is_support = > sienna_cichlid_is_mode2_reset_supported, > + .mode2_reset = sienna_cichlid_mode2_reset, > }; > > void sienna_cichlid_set_ppt_funcs(struct smu_context *smu) > -- > 2.25.1