[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal <asad.kamal@xxxxxxx>

Thanks & Regards
Asad

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Alex Deucher
Sent: Thursday, January 16, 2025 8:37 PM
To: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
Cc: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kamal, Asad <Asad.Kamal@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
Subject: Re: [PATCH] drm/amd/pm: Add capability flags for SMU v13.0.6

On Thu, Jan 16, 2025 at 7:29 AM Lijo Lazar <lijo.lazar@xxxxxxx> wrote:
>
> Add capability flags for SMU v13.0.6 variants. Initialize the flags
> based on firmware support. Since multiple IP versions are maintained,
> it is more manageable to initialize the capability flags once, based
> on IP version and firmware feature support.
>
> Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>

This is a nice cleanup.

Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx>
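The heart of the patch is a capability bitmask that is computed once, right after the firmware version is fetched, and then queried everywhere else. A minimal standalone C sketch of that pattern follows; the names init_caps()/cap_supported()/g_caps are illustrative only, not the driver's API (the real code stores the mask in dpm_context->caps and uses SMU_CAPS()/smu_v13_0_6_caps_supported()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative capability IDs; the patch defines one per feature. */
enum caps {
        CAP_DPM,
        CAP_UNI_METRICS,
        CAP_SDMA_RESET,
};

#define CAP_MASK(x) (1ULL << (x))

static uint64_t g_caps;

/* Run once, after the firmware version is known. */
static void init_caps(uint32_t fw_ver)
{
        uint64_t caps = CAP_MASK(CAP_DPM) | CAP_MASK(CAP_UNI_METRICS);

        if (fw_ver < 0x552F00)          /* threshold taken from the patch */
                caps &= ~CAP_MASK(CAP_DPM);
        if (fw_ver > 0x00557700)        /* likewise from the patch */
                caps |= CAP_MASK(CAP_SDMA_RESET);

        g_caps = caps;
}

/* Call sites test the cached bit instead of re-checking versions. */
static bool cap_supported(enum caps c)
{
        return (g_caps & CAP_MASK(c)) == CAP_MASK(c);
}

int main(void)
{
        init_caps(0x00557800);
        printf("SDMA reset supported: %d\n", cap_supported(CAP_SDMA_RESET));
        return 0;
}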
> ---
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h  |   1 +
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 225 ++++++++++++------
>  2 files changed, 158 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> index 356d9422b411..8d4a96e23326 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> @@ -107,6 +107,7 @@ struct smu_13_0_dpm_context {
>          struct smu_13_0_dpm_tables dpm_tables;
>          uint32_t workload_policy_mask;
>          uint32_t dcef_min_ds_clk;
> +        uint64_t caps;
>  };
>
>  enum smu_13_0_power_state {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index c12959a36d78..56e26fcd3066 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -101,38 +101,25 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_14.bin");
>  #define MCA_BANK_IPID(_ip, _hwid, _type) \
>          [AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, }
>
> -static inline bool smu_v13_0_6_is_unified_metrics(struct smu_context *smu)
> -{
> -        return (smu->adev->flags & AMD_IS_APU) &&
> -                smu->smc_fw_version <= 0x4556900;
> -}
> -
> -static inline bool smu_v13_0_6_is_other_end_count_available(struct smu_context *smu)
> -{
> -        switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
> -        case IP_VERSION(13, 0, 6):
> -                return smu->smc_fw_version >= 0x557600;
> -        case IP_VERSION(13, 0, 14):
> -                return smu->smc_fw_version >= 0x05550E00;
> -        default:
> -                return false;
> -        }
> -}
> -
> -static inline bool smu_v13_0_6_is_blw_host_limit_available(struct smu_context *smu)
> -{
> -        if (smu->adev->flags & AMD_IS_APU)
> -                return smu->smc_fw_version >= 0x04556F00;
> +enum smu_v13_0_6_caps {
> +        SMU_13_0_6_CAPS_DPM,
> +        SMU_13_0_6_CAPS_UNI_METRICS,
> +        SMU_13_0_6_CAPS_DPM_POLICY,
> +        SMU_13_0_6_CAPS_OTHER_END_METRICS,
> +        SMU_13_0_6_CAPS_SET_UCLK_MAX,
> +        SMU_13_0_6_CAPS_PCIE_METRICS,
> +        SMU_13_0_6_CAPS_HST_LIMIT_METRICS,
> +        SMU_13_0_6_CAPS_MCA_DEBUG_MODE,
> +        SMU_13_0_6_CAPS_PER_INST_METRICS,
> +        SMU_13_0_6_CAPS_CTF_LIMIT,
> +        SMU_13_0_6_CAPS_RMA_MSG,
> +        SMU_13_0_6_CAPS_ACA_SYND,
> +        SMU_13_0_6_CAPS_SDMA_RESET,
> +        SMU_13_0_6_CAPS_ALL,
> +};
>
> -        switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
> -        case IP_VERSION(13, 0, 6):
> -                return smu->smc_fw_version >= 0x557900;
> -        case IP_VERSION(13, 0, 14):
> -                return smu->smc_fw_version >= 0x05551000;
> -        default:
> -                return false;
> -        }
> -}
> +#define SMU_CAPS_MASK(x) (ULL(1) << x)
> +#define SMU_CAPS(x) SMU_CAPS_MASK(SMU_13_0_6_CAPS_##x)
>
>  struct mca_bank_ipid {
>          enum amdgpu_mca_ip ip;
> @@ -297,6 +284,119 @@ struct smu_v13_0_6_dpm_map {
>          uint32_t *freq_table;
>  };
>
> +static void smu_v13_0_14_init_caps(struct smu_context *smu)
> +{
> +        struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
> +        uint64_t caps = SMU_CAPS(DPM) | SMU_CAPS(UNI_METRICS) |
> +                        SMU_CAPS(SET_UCLK_MAX) | SMU_CAPS(DPM_POLICY) |
> +                        SMU_CAPS(PCIE_METRICS) | SMU_CAPS(CTF_LIMIT) |
> +                        SMU_CAPS(MCA_DEBUG_MODE) | SMU_CAPS(RMA_MSG) |
> +                        SMU_CAPS(ACA_SYND);
> +        uint32_t fw_ver = smu->smc_fw_version;
> +
> +        if (fw_ver >= 0x05550E00)
> +                caps |= SMU_CAPS(OTHER_END_METRICS);
> +        if (fw_ver >= 0x05551000)
> +                caps |= SMU_CAPS(HST_LIMIT_METRICS);
> +        if (fw_ver >= 0x05550B00)
> +                caps |= SMU_CAPS(PER_INST_METRICS);
> +        if (fw_ver > 0x05550f00)
> +                caps |= SMU_CAPS(SDMA_RESET);
> +
> +        dpm_context->caps = caps;
> +}
> +
> +static void smu_v13_0_6_init_caps(struct smu_context *smu)
> +{
> +        uint64_t caps = SMU_CAPS(DPM) | SMU_CAPS(UNI_METRICS) |
> +                        SMU_CAPS(SET_UCLK_MAX) | SMU_CAPS(DPM_POLICY) |
> +                        SMU_CAPS(PCIE_METRICS) | SMU_CAPS(MCA_DEBUG_MODE) |
> +                        SMU_CAPS(CTF_LIMIT) | SMU_CAPS(RMA_MSG) |
> +                        SMU_CAPS(ACA_SYND);
> +        struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
> +        struct amdgpu_device *adev = smu->adev;
> +        uint32_t fw_ver = smu->smc_fw_version;
> +        uint32_t pgm = (fw_ver >> 24) & 0xFF;
> +
> +        if (fw_ver < 0x552F00)
> +                caps &= ~SMU_CAPS(DPM);
> +
> +        if (adev->flags & AMD_IS_APU) {
> +                caps &= ~SMU_CAPS(PCIE_METRICS);
> +                caps &= ~SMU_CAPS(SET_UCLK_MAX);
> +                caps &= ~SMU_CAPS(DPM_POLICY);
> +                caps &= ~SMU_CAPS(RMA_MSG);
> +                caps &= ~SMU_CAPS(ACA_SYND);
> +
> +                if (fw_ver <= 0x4556900)
> +                        caps &= ~SMU_CAPS(UNI_METRICS);
> +
> +                if (fw_ver >= 0x04556F00)
> +                        caps |= SMU_CAPS(HST_LIMIT_METRICS);
> +                if (fw_ver >= 0x04556A00)
> +                        caps |= SMU_CAPS(PER_INST_METRICS);
> +                if (fw_ver < 0x554500)
> +                        caps &= ~SMU_CAPS(CTF_LIMIT);
> +        } else {
> +                if (fw_ver >= 0x557600)
> +                        caps |= SMU_CAPS(OTHER_END_METRICS);
> +                if (fw_ver < 0x00556000)
> +                        caps &= ~SMU_CAPS(DPM_POLICY);
> +                if (amdgpu_sriov_vf(adev) && (fw_ver < 0x556600))
> +                        caps &= ~SMU_CAPS(SET_UCLK_MAX);
> +                if (fw_ver < 0x556300)
> +                        caps &= ~SMU_CAPS(PCIE_METRICS);
> +                if (fw_ver < 0x554800)
> +                        caps &= ~SMU_CAPS(MCA_DEBUG_MODE);
> +                if (fw_ver >= 0x556F00)
> +                        caps |= SMU_CAPS(PER_INST_METRICS);
> +                if (fw_ver < 0x554500)
> +                        caps &= ~SMU_CAPS(CTF_LIMIT);
> +                if (fw_ver < 0x00555a00)
> +                        caps &= ~SMU_CAPS(RMA_MSG);
> +                if (fw_ver < 0x00555600)
> +                        caps &= ~SMU_CAPS(ACA_SYND);
> +                if (pgm == 0 && fw_ver >= 0x557900)
> +                        caps |= SMU_CAPS(HST_LIMIT_METRICS);
> +        }
> +        if (((pgm == 7) && (fw_ver > 0x07550700)) ||
> +            ((pgm == 0) && (fw_ver > 0x00557700)) ||
> +            ((pgm == 4) && (fw_ver > 0x4556e6c)))
> +                caps |= SMU_CAPS(SDMA_RESET);
> +
> +        dpm_context->caps = caps;
> +}
> +
> +static inline bool smu_v13_0_6_caps_supported(struct smu_context *smu,
> +                                              enum smu_v13_0_6_caps caps)
> +{
> +        struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context;
> +
> +        return (dpm_context->caps & SMU_CAPS_MASK(caps)) == SMU_CAPS_MASK(caps);
> +}
> +
> +static void smu_v13_0_x_init_caps(struct smu_context *smu)
> +{
> +        switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
> +        case IP_VERSION(13, 0, 14):
> +                return smu_v13_0_14_init_caps(smu);
> +        default:
> +                return smu_v13_0_6_init_caps(smu);
> +        }
> +}
> +
> +static int smu_v13_0_6_check_fw_version(struct smu_context *smu)
> +{
> +        int r;
> +
> +        r = smu_v13_0_check_fw_version(smu);
> +        /* Initialize caps flags once fw version is fetched */
> +        if (!r)
> +                smu_v13_0_x_init_caps(smu);
> +
> +        return r;
> +}
> +
>  static int smu_v13_0_6_init_microcode(struct smu_context *smu)
>  {
>          const struct smc_firmware_header_v2_1 *v2_1;
> @@ -616,7 +716,7 @@ static int smu_v13_0_6_setup_driver_pptable(struct smu_context *smu)
>          MetricsTableA_t *metrics_a = (MetricsTableA_t *)smu_table->metrics_table;
>          struct PPTable_t *pptable =
>                  (struct PPTable_t *)smu_table->driver_pptable;
> -        bool flag = smu_v13_0_6_is_unified_metrics(smu);
> +        bool flag = !smu_v13_0_6_caps_supported(smu, SMU_CAPS(UNI_METRICS));
>          int ret, i, retry = 100;
>          uint32_t table_version;
>
> @@ -812,8 +912,7 @@ static int smu_v13_0_6_set_default_dpm_table(struct smu_context *smu)
>          smu_v13_0_6_setup_driver_pptable(smu);
>
>          /* DPM policy not supported in older firmwares */
> -        if (!(smu->adev->flags & AMD_IS_APU) &&
> -            (smu->smc_fw_version < 0x00556000)) {
> +        if (!smu_v13_0_6_caps_supported(smu, SMU_CAPS(DPM_POLICY))) {
>                  struct smu_dpm_context *smu_dpm = &smu->smu_dpm;
>
>                  smu_dpm->dpm_policies->policy_mask &=
> @@ -990,7 +1089,7 @@ static int smu_v13_0_6_get_smu_metrics_data(struct smu_context *smu,
>          struct smu_table_context *smu_table = &smu->smu_table;
>          MetricsTableX_t *metrics_x = (MetricsTableX_t *)smu_table->metrics_table;
>          MetricsTableA_t *metrics_a = (MetricsTableA_t *)smu_table->metrics_table;
> -        bool flag = smu_v13_0_6_is_unified_metrics(smu);
> +        bool flag = !smu_v13_0_6_caps_supported(smu, SMU_CAPS(UNI_METRICS));
>          struct amdgpu_device *adev = smu->adev;
>          int ret = 0;
>          int xcc_id;
> @@ -1003,7 +1102,7 @@ static int smu_v13_0_6_get_smu_metrics_data(struct smu_context *smu,
>          switch (member) {
>          case METRICS_CURR_GFXCLK:
>          case METRICS_AVERAGE_GFXCLK:
> -                if (smu->smc_fw_version >= 0x552F00) {
> +                if (smu_v13_0_6_caps_supported(smu, SMU_CAPS(DPM))) {
>                          xcc_id = GET_INST(GC, 0);
>                          *value = SMUQ10_ROUND(GET_METRIC_FIELD(GfxclkFrequency, flag)[xcc_id]);
>                  } else {
> @@ -1692,7 +1791,7 @@ static int smu_v13_0_6_notify_unload(struct smu_context *smu)
>  static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable)
>  {
>          /* NOTE: this ClearMcaOnRead message is only supported for smu version 85.72.0 or higher */
> -        if (smu->smc_fw_version < 0x554800)
> +        if (!smu_v13_0_6_caps_supported(smu, SMU_CAPS(MCA_DEBUG_MODE)))
>                  return 0;
>
>          return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
> @@ -1837,9 +1936,8 @@ static int smu_v13_0_6_set_soft_freq_limited_range(struct smu_context *smu,
>                  if (max == pstate_table->uclk_pstate.curr.max)
>                          return 0;
>                  /* For VF, only allowed in FW versions 85.102 or greater */
> -                if (amdgpu_sriov_vf(adev) &&
> -                    ((smu->smc_fw_version < 0x556600) ||
> -                     (adev->flags & AMD_IS_APU)))
> +                if (!smu_v13_0_6_caps_supported(smu,
> +                                                SMU_CAPS(SET_UCLK_MAX)))
>                          return -EOPNOTSUPP;
>                  /* Only max clock limiting is allowed for UCLK */
>                  ret = smu_v13_0_set_soft_freq_limited_range(
> @@ -2043,7 +2141,7 @@ static int smu_v13_0_6_get_enabled_mask(struct smu_context *smu,
>
>          ret = smu_cmn_get_enabled_mask(smu, feature_mask);
>
> -        if (ret == -EIO && smu->smc_fw_version < 0x552F00) {
> +        if (ret == -EIO && !smu_v13_0_6_caps_supported(smu, SMU_CAPS(DPM))) {
>                  *feature_mask = 0;
>                  ret = 0;
>          }
> @@ -2336,11 +2434,10 @@ static int smu_v13_0_6_get_current_pcie_link_speed(struct smu_context *smu)
>
>  static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table)
>  {
> -        bool per_inst, smu_13_0_6_per_inst, smu_13_0_14_per_inst, apu_per_inst;
>          struct smu_table_context *smu_table = &smu->smu_table;
>          struct gpu_metrics_v1_7 *gpu_metrics =
>                  (struct gpu_metrics_v1_7 *)smu_table->gpu_metrics_table;
> -        bool flag = smu_v13_0_6_is_unified_metrics(smu);
> +        bool flag = !smu_v13_0_6_caps_supported(smu, SMU_CAPS(UNI_METRICS));
>          int ret = 0, xcc_id, inst, i, j, k, idx;
>          struct amdgpu_device *adev = smu->adev;
>          MetricsTableX_t *metrics_x;
> @@ -2348,6 +2445,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
>          struct amdgpu_xcp *xcp;
>          u16 link_width_level;
>          u32 inst_mask;
> +        bool per_inst;
>
>          metrics_x = kzalloc(max(sizeof(MetricsTableX_t), sizeof(MetricsTableA_t)), GFP_KERNEL);
>          ret = smu_v13_0_6_get_metrics_table(smu, metrics_x, true);
> @@ -2421,7 +2519,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
>           * table for both pf & one vf for smu version 85.99.0 or higher else report only
>           * for pf from registers
>           */
> -        if (smu->smc_fw_version >= 0x556300) {
> +        if (smu_v13_0_6_caps_supported(smu, SMU_CAPS(PCIE_METRICS))) {
>                  gpu_metrics->pcie_link_width = metrics_x->PCIeLinkWidth;
>                  gpu_metrics->pcie_link_speed =
>                          pcie_gen_to_speed(metrics_x->PCIeLinkSpeed);
> @@ -2450,7 +2548,8 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
>                          metrics_x->PCIeNAKSentCountAcc;
>                  gpu_metrics->pcie_nak_rcvd_count_acc =
>                          metrics_x->PCIeNAKReceivedCountAcc;
> -                if (smu_v13_0_6_is_other_end_count_available(smu))
> +                if (smu_v13_0_6_caps_supported(smu,
> +                                               SMU_CAPS(OTHER_END_METRICS)))
>                          gpu_metrics->pcie_lc_perf_other_end_recovery =
>                                  metrics_x->PCIeOtherEndRecoveryAcc;
>
> @@ -2475,17 +2574,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
>
>          gpu_metrics->num_partition = adev->xcp_mgr->num_xcps;
>
> -        apu_per_inst = (adev->flags & AMD_IS_APU) && (smu->smc_fw_version >= 0x04556A00);
> -        smu_13_0_6_per_inst = !(adev->flags & AMD_IS_APU) &&
> -                              (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)
> -                               == IP_VERSION(13, 0, 6)) &&
> -                              (smu->smc_fw_version >= 0x556F00);
> -        smu_13_0_14_per_inst = !(adev->flags & AMD_IS_APU) &&
> -                               (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)
> -                                == IP_VERSION(13, 0, 14)) &&
> -                               (smu->smc_fw_version >= 0x05550B00);
> -
> -        per_inst = apu_per_inst || smu_13_0_6_per_inst || smu_13_0_14_per_inst;
> +        per_inst = smu_v13_0_6_caps_supported(smu, SMU_CAPS(PER_INST_METRICS));
>
>          for_each_xcp(adev->xcp_mgr, xcp, i) {
>                  amdgpu_xcp_get_inst_details(xcp, AMDGPU_XCP_VCN, &inst_mask);
> @@ -2516,7 +2605,8 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
>                          gpu_metrics->xcp_stats[i].gfx_busy_acc[idx] =
>                                  SMUQ10_ROUND(metrics_x->GfxBusyAcc[inst]);
>
> -                        if (smu_v13_0_6_is_blw_host_limit_available(smu))
> +                        if (smu_v13_0_6_caps_supported(
> +                                    smu, SMU_CAPS(HST_LIMIT_METRICS)))
>                                  gpu_metrics->xcp_stats[i].gfx_below_host_limit_acc[idx] =
>                                          SMUQ10_ROUND(metrics_x->GfxclkBelowHostLimitAcc
>                                                       [inst]);
> @@ -2624,7 +2714,7 @@ static int smu_v13_0_6_get_thermal_temperature_range(struct smu_context *smu,
>                  return -EINVAL;
>
>          /*Check smu version, GetCtfLimit message only supported for smu version 85.69 or higher */
> -        if (smu->smc_fw_version < 0x554500)
> +        if (!smu_v13_0_6_caps_supported(smu, SMU_CAPS(CTF_LIMIT)))
>                  return 0;
>
>          /* Get SOC Max operating temperature */
> @@ -2726,11 +2816,10 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct smu_context *smu,
>
>  static int smu_v13_0_6_send_rma_reason(struct smu_context *smu)
>  {
> -        struct amdgpu_device *adev = smu->adev;
>          int ret;
>
>          /* NOTE: the message is only valid on dGPU with pmfw 85.90.0 and above */
> -        if ((adev->flags & AMD_IS_APU) || smu->smc_fw_version < 0x00555a00)
> +        if (!smu_v13_0_6_caps_supported(smu, SMU_CAPS(RMA_MSG)))
>                  return 0;
>
>          ret = smu_cmn_send_smc_msg(smu, SMU_MSG_RmaDueToBadPageThreshold, NULL);
> @@ -2750,18 +2839,17 @@ static int smu_v13_0_6_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
>          smu_program = (smu->smc_fw_version >> 24) & 0xff;
>          switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) {
>          case IP_VERSION(13, 0, 6):
> -                if (((smu_program == 7) && (smu->smc_fw_version > 0x07550700)) ||
> -                    ((smu_program == 0) && (smu->smc_fw_version > 0x00557700)))
> +                if ((smu_program == 7 || smu_program == 0) &&
> +                    smu_v13_0_6_caps_supported(smu, SMU_CAPS(SDMA_RESET)))
>                          ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ResetSDMA, inst_mask, NULL);
>                  else if ((smu_program == 4) &&
> -                         (smu->smc_fw_version > 0x4556e6c))
> +                         smu_v13_0_6_caps_supported(smu, SMU_CAPS(SDMA_RESET)))
>                          ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ResetSDMA2, inst_mask, NULL);
>                  break;
>          case IP_VERSION(13, 0, 14):
> -                if ((smu_program == 5) &&
> -                    (smu->smc_fw_version > 0x05550f00))
> +                if (smu_v13_0_6_caps_supported(smu, SMU_CAPS(SDMA_RESET)))
>                          ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ResetSDMA2, inst_mask, NULL);
>                  break;
> @@ -3087,7 +3175,7 @@ static bool mca_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amd
>          if (instlo != 0x03b30400)
>                  return false;
>
> -        if (!(adev->flags & AMD_IS_APU) && smu->smc_fw_version >= 0x00555600) {
> +        if (smu_v13_0_6_caps_supported(smu, SMU_CAPS(ACA_SYND))) {
>                  errcode = MCA_REG__SYND__ERRORINFORMATION(entry->regs[MCA_REG_IDX_SYND]);
>                  errcode &= 0xff;
>          } else {
> @@ -3373,9 +3461,10 @@ static int aca_smu_get_valid_aca_bank(struct amdgpu_device *adev,
>
>  static int aca_smu_parse_error_code(struct amdgpu_device *adev,
>                                      struct aca_bank *bank)
>  {
> +        struct smu_context *smu = adev->powerplay.pp_handle;
>          int error_code;
>
> -        if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600)
> +        if (smu_v13_0_6_caps_supported(smu, SMU_CAPS(ACA_SYND)))
>                  error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]);
>          else
>                  error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]);
> @@ -3413,7 +3502,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
>          .fini_power = smu_v13_0_fini_power,
>          .check_fw_status = smu_v13_0_6_check_fw_status,
>          /* pptable related */
> -        .check_fw_version = smu_v13_0_check_fw_version,
> +        .check_fw_version = smu_v13_0_6_check_fw_version,
>          .set_driver_table_location = smu_v13_0_set_driver_table_location,
>          .set_tool_table_location = smu_v13_0_set_tool_table_location,
>          .notify_memory_pool_location = smu_v13_0_notify_memory_pool_location,
> --
> 2.25.1
>
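A usage note for anyone backporting this: the net effect at each call site is that scattered APU-flag and firmware-version checks collapse into one caps query against state initialized at check_fw_version() time. A hedged before/after sketch, paraphrased from the DPM_POLICY hunk above:

/* Before: each call site re-derived support from device flags + fw version. */
if (!(smu->adev->flags & AMD_IS_APU) &&
    (smu->smc_fw_version < 0x00556000)) {
        /* ... mask out the DPM policy ... */
}

/* After: one query against the caps bitmask cached in dpm_context. */
if (!smu_v13_0_6_caps_supported(smu, SMU_CAPS(DPM_POLICY))) {
        /* ... mask out the DPM policy ... */
}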