[AMD Official Use Only - General]
Reviewed-by: Bhawanpreet Lakha <Bhawanpreet.Lakha@xxxxxxx>
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> on behalf of Aurabindo Pillai <aurabindo.pillai@xxxxxxx>
Sent: March 10, 2023 12:56 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Wentland, Harry <Harry.Wentland@xxxxxxx>; Siqueira, Rodrigo <Rodrigo.Siqueira@xxxxxxx>; Mahfooz, Hamza <Hamza.Mahfooz@xxxxxxx> Subject: Re: [PATCH 2/2] drm/amd/display: Enable FAMS for DCN3x On 3/10/23 12:48, Aurabindo Pillai wrote: > [Why&How] > Firmware Assisted Memclk Switching enables lowering mclk using DMCUB > when it cannot be normally done due to not having enough time within > vblank. FAMS extends vblank on monitors that support variable refresh > rate thereby allowing enough time to do an mclk switch sequence > during vblank. > > When tested with 4k@144Hz monitor on DCN32, power consumption of about > 40W was saved since multiple clocks like MCLK, SOCCLK, and FCLK > were brought down. > > Signed-off-by: Aurabindo Pillai <aurabindo.pillai@xxxxxxx> > Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@xxxxxxx> > --- > .../gpu/drm/amd/display/dc/dcn30/dcn30_optc.c | 7 +- > .../drm/amd/display/dc/dcn30/dcn30_resource.h | 3 + > .../drm/amd/display/dc/dcn31/dcn31_hwseq.c | 4 ++ > .../drm/amd/display/dc/dcn32/dcn32_hwseq.c | 2 + > .../drm/amd/display/dc/dcn32/dcn32_resource.c | 2 +- > .../drm/amd/display/dc/dml/dcn30/dcn30_fpu.c | 71 ++++++++++++++++--- > .../drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 5 ++ > 7 files changed, 84 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c > index 08b92715e2e6..9963bffb1e07 100644 > --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c > +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_optc.c > @@ -301,7 +301,12 @@ void optc3_wait_drr_doublebuffer_pending_clear(struct timing_generator *optc) > > void optc3_set_vtotal_min_max(struct timing_generator *optc, int vtotal_min, int vtotal_max) > { > - optc1_set_vtotal_min_max(optc, vtotal_min, vtotal_max); > + struct dc *dc = optc->ctx->dc; > + > + if (dc->caps.dmub_caps.mclk_sw && 
!dc->debug.disable_fams) > + dc_dmub_srv_drr_update_cmd(dc, optc->inst, vtotal_min, vtotal_max); > + else > + optc1_set_vtotal_min_max(optc, vtotal_min, vtotal_max); > } > > void optc3_tg_init(struct timing_generator *optc) > diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.h b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.h > index 8e6b8b7368fd..d8805618a9a1 100644 > --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.h > +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_resource.h > @@ -102,6 +102,9 @@ void dcn30_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_params > > bool dcn30_can_support_mclk_switch_using_fw_based_vblank_stretch(struct dc *dc, struct dc_state *context); > void dcn30_setup_mclk_switch_using_fw_based_vblank_stretch(struct dc *dc, struct dc_state *context); > + > +void dcn30_setup_mclk_switch_using_fw_based_vblank_stretch(struct dc *dc, struct dc_state *context); > + This is a duplicate declaration; I will remove it before applying. > int dcn30_find_dummy_latency_index_for_fw_based_mclk_switch(struct dc *dc, struct dc_state *context, > display_e2e_pipe_params_st *pipes, int pipe_cnt, int vlevel); > > diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c > index 80a0c5a575a9..40080113ed5e 100644 > --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c > +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_hwseq.c > @@ -295,6 +295,10 @@ void dcn31_init_hw(struct dc *dc) > if (dc->res_pool->hubbub->funcs->init_crb) > dc->res_pool->hubbub->funcs->init_crb(dc->res_pool->hubbub); > #endif > + /* Get DMCUB capabilities */ > + dc_dmub_srv_query_caps_cmd(dc->ctx->dmub_srv->dmub); > + dc->caps.dmub_caps.psr = dc->ctx->dmub_srv->dmub->feature_caps.psr; > + dc->caps.dmub_caps.mclk_sw = dc->ctx->dmub_srv->dmub->feature_caps.fw_assisted_mclk_switch; > } > > void dcn31_dsc_pg_control( > diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c 
b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c > index f87db2271924..3220f9ad8a47 100644 > --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c > +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hwseq.c > @@ -919,6 +919,8 @@ void dcn32_init_hw(struct dc *dc) > if (dc->ctx->dmub_srv) { > dc_dmub_srv_query_caps_cmd(dc->ctx->dmub_srv->dmub); > dc->caps.dmub_caps.psr = dc->ctx->dmub_srv->dmub->feature_caps.psr; > + dc->caps.dmub_caps.mclk_sw = dc->ctx->dmub_srv->dmub->feature_caps.fw_assisted_mclk_switch; > + Will remove the extra newline before applying > } > } > > diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c > index 100b6df33b33..b1944e49a65d 100644 > --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c > +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource.c > @@ -2013,7 +2013,7 @@ int dcn32_populate_dml_pipes_from_context( > // In general cases we want to keep the dram clock change requirement > // (prefer configs that support MCLK switch). 
Only override to false > // for SubVP > - if (subvp_in_use) > + if (context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching || subvp_in_use) > context->bw_ctx.dml.soc.dram_clock_change_requirement_final = false; > else > context->bw_ctx.dml.soc.dram_clock_change_requirement_final = true; > diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/dcn30_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn30/dcn30_fpu.c > index 4fa636364793..53f21b0b3630 100644 > --- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/dcn30_fpu.c > +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/dcn30_fpu.c > @@ -368,7 +368,9 @@ void dcn30_fpu_update_soc_for_wm_a(struct dc *dc, struct dc_state *context) > dc_assert_fp_enabled(); > > if (dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].valid) { > - context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].dml_input.pstate_latency_us; > + if (!context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching || > + context->bw_ctx.dml.soc.dram_clock_change_latency_us == 0) > + context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].dml_input.pstate_latency_us; > context->bw_ctx.dml.soc.sr_enter_plus_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].dml_input.sr_enter_plus_exit_time_us; > context->bw_ctx.dml.soc.sr_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].dml_input.sr_exit_time_us; > } > @@ -384,9 +386,34 @@ void dcn30_fpu_calculate_wm_and_dlg( > int i, pipe_idx; > double dcfclk = context->bw_ctx.dml.vba.DCFCLKState[vlevel][maxMpcComb]; > bool pstate_en = context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][maxMpcComb] != dm_dram_clock_change_unsupported; > + unsigned int dummy_latency_index = 0; > > dc_assert_fp_enabled(); > > + context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching = false; > + > + if (!pstate_en) { > + /* only when the mclk switch can not be natural, is the fw based vblank stretch attempted */ > + 
context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching = > + dcn30_can_support_mclk_switch_using_fw_based_vblank_stretch(dc, context); > + > + if (context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching) { > + dummy_latency_index = dcn30_find_dummy_latency_index_for_fw_based_mclk_switch(dc, > + context, pipes, pipe_cnt, vlevel); > + > + /* After calling dcn30_find_dummy_latency_index_for_fw_based_mclk_switch > + * we reinstate the original dram_clock_change_latency_us on the context > + * and all variables that may have changed up to this point, except the > + * newly found dummy_latency_index > + */ > + context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_A].dml_input.pstate_latency_us; > + dcn30_internal_validate_bw(dc, context, pipes, &pipe_cnt, &vlevel, false, true); > + maxMpcComb = context->bw_ctx.dml.vba.maxMpcComb; > + dcfclk = context->bw_ctx.dml.vba.DCFCLKState[vlevel][context->bw_ctx.dml.vba.maxMpcComb]; > + pstate_en = context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][maxMpcComb] != dm_dram_clock_change_unsupported; > + } > + } > + > if (context->bw_ctx.dml.soc.min_dcfclk > dcfclk) > dcfclk = context->bw_ctx.dml.soc.min_dcfclk; > > @@ -449,15 +476,29 @@ void dcn30_fpu_calculate_wm_and_dlg( > unsigned int min_dram_speed_mts = context->bw_ctx.dml.vba.DRAMSpeed; > unsigned int min_dram_speed_mts_margin = 160; > > - if (context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] == dm_dram_clock_change_unsupported) > - min_dram_speed_mts = dc->clk_mgr->bw_params->clk_table.entries[dc->clk_mgr->bw_params->clk_table.num_entries - 1].memclk_mhz * 16; > + context->bw_ctx.dml.soc.dram_clock_change_latency_us = > + dc->clk_mgr->bw_params->dummy_pstate_table[0].dummy_pstate_latency_us; > > - /* find largest table entry that is lower than dram speed, but lower than DPM0 still uses DPM0 */ > - for (i = 3; i > 0; i--) > - if (min_dram_speed_mts + min_dram_speed_mts_margin > 
dc->clk_mgr->bw_params->dummy_pstate_table[i].dram_speed_mts) > - break; > + if (context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][maxMpcComb] == > + dm_dram_clock_change_unsupported) { > + int min_dram_speed_mts_offset = dc->clk_mgr->bw_params->clk_table.num_entries - 1; > + > + min_dram_speed_mts = > + dc->clk_mgr->bw_params->clk_table.entries[min_dram_speed_mts_offset].memclk_mhz * 16; > + } > > - context->bw_ctx.dml.soc.dram_clock_change_latency_us = dc->clk_mgr->bw_params->dummy_pstate_table[i].dummy_pstate_latency_us; > + if (!context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching) { > + /* find largest table entry that is lower than dram speed, > + * but lower than DPM0 still uses DPM0 > + */ > + for (dummy_latency_index = 3; dummy_latency_index > 0; dummy_latency_index--) > + if (min_dram_speed_mts + min_dram_speed_mts_margin > > + dc->clk_mgr->bw_params->dummy_pstate_table[dummy_latency_index].dram_speed_mts) > + break; > + } > + > + context->bw_ctx.dml.soc.dram_clock_change_latency_us = > + dc->clk_mgr->bw_params->dummy_pstate_table[dummy_latency_index].dummy_pstate_latency_us; > > context->bw_ctx.dml.soc.sr_enter_plus_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].dml_input.sr_enter_plus_exit_time_us; > context->bw_ctx.dml.soc.sr_exit_time_us = dc->clk_mgr->bw_params->wm_table.nv_entries[WM_C].dml_input.sr_exit_time_us; > @@ -520,6 +561,20 @@ void dcn30_fpu_calculate_wm_and_dlg( > pipe_idx++; > } > > + /* WA: restrict FW MCLK switch to use first non-strobe mode (Beige Goby BW issue) */ > + if (context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching && > + dc->dml.soc.num_chans <= 4 && > + context->bw_ctx.dml.vba.DRAMSpeed <= 1700 && > + context->bw_ctx.dml.vba.DRAMSpeed >= 1500) { > + > + for (i = 0; i < dc->dml.soc.num_states; i++) { > + if (dc->dml.soc.clock_limits[i].dram_speed_mts > 1700) { > + context->bw_ctx.dml.vba.DRAMSpeed = dc->dml.soc.clock_limits[i].dram_speed_mts; > + break; > + } > + } > + } > + > 
dcn20_calculate_dlg_params(dc, context, pipes, pipe_cnt, vlevel); > > if (!pstate_en) > diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c > index 077674be452b..ee2683200799 100644 > --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c > +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c > @@ -1331,6 +1331,11 @@ static void dcn32_calculate_dlg_params(struct dc *dc, struct dc_state *context, > context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] > != dm_dram_clock_change_unsupported; > > + /* Pstate change might not be supported by hardware, but it might be > + * possible with firmware driven vertical blank stretching. > + */ > + context->bw_ctx.bw.dcn.clk.p_state_change_support |= context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching; > + > context->bw_ctx.bw.dcn.clk.dppclk_khz = 0; > context->bw_ctx.bw.dcn.clk.dtbclk_en = is_dtbclk_required(dc, context); > context->bw_ctx.bw.dcn.clk.ref_dtbclk_khz = context->bw_ctx.dml.vba.DTBCLKPerState[vlevel] * 1000; |