On Tue, Sep 7, 2021 at 10:10 AM Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote: > > Do I know why? No. I do note that that code is disgusting. > > It's passing one of those structs around by value, for example. That's > a 72-byte structure that is copied on the stack due to stupid calling > conventions. Maybe clang generates a few extra temporaries for it as > part of the function call stack setup? Who knows.. Ooh, yes. This attached patch is crap - it converts the helper functions to use const pointers instead of passing the whole structure, but it then only converts that one file that *uses* them. So the end result will not compile in general, but you can do make drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn30/display_rq_dlg_calc_30.o and it compiles for me. And while gcc doesn't care that much - it will apparently either generate the argument stack every call - clang cares deeply. The nasty 720-byte stack frame that clang generates turns into just a 320-byte one, and code generation in general looks a *lot* better. Now, as mentioned, this patch is broken and incomplete. But I really think the AMD GPU people need to do this. It makes those functions go from practically unusable to not horribly disgusting. So Harry/Leo/Alex/Christian and amd-gfx list - can you look into making this ugly "make one file compile better" patch actually work properly? It *looks* lto me ike that code was perhaps written for a C++ compiler and the helpers have been written as a "pass by reference", and the arguments used to be const display_data_rq_misc_params_st& rq_misc_param and then the compiler will pass the argument as a pointer. And then it was converted to C, and the "pass by reference" in the function declaration was turned into "pass by value", to avoid changing "." to "->" in the use. But a '&arg' thing in C++ really is a '*arg' pointer in C, and should have been done as that. Of course, it's also possible that that code was simply written by somebody who didn't understand just *how* horrible it is to pass structures bigger than a word or two by value. Do we have a compiler warning for passing big structures by value? Linus
.../display/dc/dml/dcn30/display_rq_dlg_calc_30.c | 142 ++++++++++----------- .../display/dc/dml/dcn30/display_rq_dlg_calc_30.h | 2 +- .../amd/display/dc/dml/display_rq_dlg_helpers.h | 20 +-- 3 files changed, 82 insertions(+), 82 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c index 0d934fae1c3a..9b5ee53d2de3 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.c @@ -92,7 +92,7 @@ static void extract_rq_sizing_regs(struct display_mode_lib *mode_lib, const display_data_rq_sizing_params_st rq_sizing) { dml_print("DML_DLG: %s: rq_sizing param\n", __func__); - print__data_rq_sizing_params_st(mode_lib, rq_sizing); + print__data_rq_sizing_params_st(mode_lib, &rq_sizing); rq_regs->chunk_size = dml_log2(rq_sizing.chunk_bytes) - 10; @@ -113,28 +113,28 @@ static void extract_rq_sizing_regs(struct display_mode_lib *mode_lib, static void extract_rq_regs(struct display_mode_lib *mode_lib, display_rq_regs_st *rq_regs, - const display_rq_params_st rq_param) + const display_rq_params_st *rq_param) { unsigned int detile_buf_size_in_bytes = mode_lib->ip.det_buffer_size_kbytes * 1024; unsigned int detile_buf_plane1_addr = 0; - extract_rq_sizing_regs(mode_lib, &(rq_regs->rq_regs_l), rq_param.sizing.rq_l); + extract_rq_sizing_regs(mode_lib, &(rq_regs->rq_regs_l), rq_param->sizing.rq_l); - rq_regs->rq_regs_l.pte_row_height_linear = dml_floor(dml_log2(rq_param.dlg.rq_l.dpte_row_height), + rq_regs->rq_regs_l.pte_row_height_linear = dml_floor(dml_log2(rq_param->dlg.rq_l.dpte_row_height), 1) - 3; - if (rq_param.yuv420) { - extract_rq_sizing_regs(mode_lib, &(rq_regs->rq_regs_c), rq_param.sizing.rq_c); - rq_regs->rq_regs_c.pte_row_height_linear = dml_floor(dml_log2(rq_param.dlg.rq_c.dpte_row_height), + if (rq_param->yuv420) { + extract_rq_sizing_regs(mode_lib, &(rq_regs->rq_regs_c), rq_param->sizing.rq_c); + rq_regs->rq_regs_c.pte_row_height_linear = dml_floor(dml_log2(rq_param->dlg.rq_c.dpte_row_height), 1) - 3; } - rq_regs->rq_regs_l.swath_height = dml_log2(rq_param.dlg.rq_l.swath_height); - rq_regs->rq_regs_c.swath_height = dml_log2(rq_param.dlg.rq_c.swath_height); + rq_regs->rq_regs_l.swath_height = dml_log2(rq_param->dlg.rq_l.swath_height); + rq_regs->rq_regs_c.swath_height = dml_log2(rq_param->dlg.rq_c.swath_height); // FIXME: take the max between luma, chroma chunk size? // okay for now, as we are setting chunk_bytes to 8kb anyways - if (rq_param.sizing.rq_l.chunk_bytes >= 32 * 1024 || (rq_param.yuv420 && rq_param.sizing.rq_c.chunk_bytes >= 32 * 1024)) { //32kb + if (rq_param->sizing.rq_l.chunk_bytes >= 32 * 1024 || (rq_param->yuv420 && rq_param->sizing.rq_c.chunk_bytes >= 32 * 1024)) { //32kb rq_regs->drq_expansion_mode = 0; } else { rq_regs->drq_expansion_mode = 2; @@ -143,9 +143,9 @@ static void extract_rq_regs(struct display_mode_lib *mode_lib, rq_regs->mrq_expansion_mode = 1; rq_regs->crq_expansion_mode = 1; - if (rq_param.yuv420) { - if ((double)rq_param.misc.rq_l.stored_swath_bytes - / (double)rq_param.misc.rq_c.stored_swath_bytes <= 1.5) { + if (rq_param->yuv420) { + if ((double)rq_param->misc.rq_l.stored_swath_bytes + / (double)rq_param->misc.rq_c.stored_swath_bytes <= 1.5) { detile_buf_plane1_addr = (detile_buf_size_in_bytes / 2.0 / 64.0); // half to chroma } else { detile_buf_plane1_addr = dml_round_to_multiple((unsigned int)((2.0 * detile_buf_size_in_bytes) / 3.0), @@ -747,7 +747,7 @@ static void get_surf_rq_param(struct display_mode_lib *mode_lib, display_data_rq_sizing_params_st *rq_sizing_param, display_data_rq_dlg_params_st *rq_dlg_param, display_data_rq_misc_params_st *rq_misc_param, - const display_pipe_params_st pipe_param, + const display_pipe_params_st *pipe_param, bool is_chroma, bool is_alpha) { @@ -761,32 +761,32 @@ static void get_surf_rq_param(struct display_mode_lib *mode_lib, // FIXME check if ppe apply for both luma and chroma in 422 case if (is_chroma | is_alpha) { - vp_width = pipe_param.src.viewport_width_c / ppe; - vp_height = pipe_param.src.viewport_height_c; - data_pitch = pipe_param.src.data_pitch_c; - meta_pitch = pipe_param.src.meta_pitch_c; - surface_height = pipe_param.src.surface_height_y / 2.0; + vp_width = pipe_param->src.viewport_width_c / ppe; + vp_height = pipe_param->src.viewport_height_c; + data_pitch = pipe_param->src.data_pitch_c; + meta_pitch = pipe_param->src.meta_pitch_c; + surface_height = pipe_param->src.surface_height_y / 2.0; } else { - vp_width = pipe_param.src.viewport_width / ppe; - vp_height = pipe_param.src.viewport_height; - data_pitch = pipe_param.src.data_pitch; - meta_pitch = pipe_param.src.meta_pitch; - surface_height = pipe_param.src.surface_height_y; + vp_width = pipe_param->src.viewport_width / ppe; + vp_height = pipe_param->src.viewport_height; + data_pitch = pipe_param->src.data_pitch; + meta_pitch = pipe_param->src.meta_pitch; + surface_height = pipe_param->src.surface_height_y; } - if (pipe_param.dest.odm_combine) { + if (pipe_param->dest.odm_combine) { unsigned int access_dir = 0; unsigned int full_src_vp_width = 0; unsigned int hactive_odm = 0; unsigned int src_hactive_odm = 0; - access_dir = (pipe_param.src.source_scan == dm_vert); // vp access direction: horizontal or vertical accessed - hactive_odm = pipe_param.dest.hactive / ((unsigned int)pipe_param.dest.odm_combine*2); + access_dir = (pipe_param->src.source_scan == dm_vert); // vp access direction: horizontal or vertical accessed + hactive_odm = pipe_param->dest.hactive / ((unsigned int)pipe_param->dest.odm_combine*2); if (is_chroma) { - full_src_vp_width = pipe_param.scale_ratio_depth.hscl_ratio_c * pipe_param.dest.full_recout_width; - src_hactive_odm = pipe_param.scale_ratio_depth.hscl_ratio_c * hactive_odm; + full_src_vp_width = pipe_param->scale_ratio_depth.hscl_ratio_c * pipe_param->dest.full_recout_width; + src_hactive_odm = pipe_param->scale_ratio_depth.hscl_ratio_c * hactive_odm; } else { - full_src_vp_width = pipe_param.scale_ratio_depth.hscl_ratio * pipe_param.dest.full_recout_width; - src_hactive_odm = pipe_param.scale_ratio_depth.hscl_ratio * hactive_odm; + full_src_vp_width = pipe_param->scale_ratio_depth.hscl_ratio * pipe_param->dest.full_recout_width; + src_hactive_odm = pipe_param->scale_ratio_depth.hscl_ratio * hactive_odm; } if (access_dir == 0) { @@ -815,7 +815,7 @@ static void get_surf_rq_param(struct display_mode_lib *mode_lib, rq_sizing_param->meta_chunk_bytes = 2048; rq_sizing_param->min_meta_chunk_bytes = 256; - if (pipe_param.src.hostvm) + if (pipe_param->src.hostvm) rq_sizing_param->mpte_group_bytes = 512; else rq_sizing_param->mpte_group_bytes = 2048; @@ -828,28 +828,28 @@ static void get_surf_rq_param(struct display_mode_lib *mode_lib, vp_height, data_pitch, meta_pitch, - pipe_param.src.source_format, - pipe_param.src.sw_mode, - pipe_param.src.macro_tile_size, - pipe_param.src.source_scan, - pipe_param.src.hostvm, + pipe_param->src.source_format, + pipe_param->src.sw_mode, + pipe_param->src.macro_tile_size, + pipe_param->src.source_scan, + pipe_param->src.hostvm, is_chroma, surface_height); } static void dml_rq_dlg_get_rq_params(struct display_mode_lib *mode_lib, display_rq_params_st *rq_param, - const display_pipe_params_st pipe_param) + const display_pipe_params_st *pipe_param) { // get param for luma surface - rq_param->yuv420 = pipe_param.src.source_format == dm_420_8 - || pipe_param.src.source_format == dm_420_10 - || pipe_param.src.source_format == dm_rgbe_alpha - || pipe_param.src.source_format == dm_420_12; + rq_param->yuv420 = pipe_param->src.source_format == dm_420_8 + || pipe_param->src.source_format == dm_420_10 + || pipe_param->src.source_format == dm_rgbe_alpha + || pipe_param->src.source_format == dm_420_12; - rq_param->yuv420_10bpc = pipe_param.src.source_format == dm_420_10; + rq_param->yuv420_10bpc = pipe_param->src.source_format == dm_420_10; - rq_param->rgbe_alpha = (pipe_param.src.source_format == dm_rgbe_alpha)?1:0; + rq_param->rgbe_alpha = (pipe_param->src.source_format == dm_rgbe_alpha)?1:0; get_surf_rq_param(mode_lib, &(rq_param->sizing.rq_l), @@ -859,7 +859,7 @@ static void dml_rq_dlg_get_rq_params(struct display_mode_lib *mode_lib, 0, 0); - if (is_dual_plane((enum source_format_class)(pipe_param.src.source_format))) { + if (is_dual_plane((enum source_format_class)(pipe_param->src.source_format))) { // get param for chroma surface get_surf_rq_param(mode_lib, &(rq_param->sizing.rq_c), @@ -871,21 +871,21 @@ static void dml_rq_dlg_get_rq_params(struct display_mode_lib *mode_lib, } // calculate how to split the det buffer space between luma and chroma - handle_det_buf_split(mode_lib, rq_param, pipe_param.src); - print__rq_params_st(mode_lib, *rq_param); + handle_det_buf_split(mode_lib, rq_param, pipe_param->src); + print__rq_params_st(mode_lib, rq_param); } void dml30_rq_dlg_get_rq_reg(struct display_mode_lib *mode_lib, display_rq_regs_st *rq_regs, - const display_pipe_params_st pipe_param) + const display_pipe_params_st *pipe_param) { display_rq_params_st rq_param = { 0 }; memset(rq_regs, 0, sizeof(*rq_regs)); dml_rq_dlg_get_rq_params(mode_lib, &rq_param, pipe_param); - extract_rq_regs(mode_lib, rq_regs, rq_param); + extract_rq_regs(mode_lib, rq_regs, &rq_param); - print__rq_regs_st(mode_lib, *rq_regs); + print__rq_regs_st(mode_lib, rq_regs); } static void calculate_ttu_cursor(struct display_mode_lib *mode_lib, @@ -984,8 +984,8 @@ static void dml_rq_dlg_get_dlg_params(struct display_mode_lib *mode_lib, const unsigned int pipe_idx, display_dlg_regs_st *disp_dlg_regs, display_ttu_regs_st *disp_ttu_regs, - const display_rq_dlg_params_st rq_dlg_param, - const display_dlg_sys_params_st dlg_sys_param, + const display_rq_dlg_params_st *rq_dlg_param, + const display_dlg_sys_params_st *dlg_sys_param, const bool cstate_en, const bool pstate_en, const bool vm_en, @@ -1137,7 +1137,7 @@ static void dml_rq_dlg_get_dlg_params(struct display_mode_lib *mode_lib, * dml_pow(2, 8)); disp_dlg_regs->dlg_vblank_end = interlaced ? (vblank_end / 2) : vblank_end; // 15 bits - min_dcfclk_mhz = dlg_sys_param.deepsleep_dcfclk_mhz; + min_dcfclk_mhz = dlg_sys_param->deepsleep_dcfclk_mhz; t_calc_us = get_tcalc(mode_lib, e2e_pipe_param, num_pipes); min_ttu_vblank = get_min_ttu_vblank(mode_lib, e2e_pipe_param, num_pipes, pipe_idx); @@ -1191,13 +1191,13 @@ static void dml_rq_dlg_get_dlg_params(struct display_mode_lib *mode_lib, scl_enable = scl->scl_enable; line_time_in_us = (htotal / pclk_freq_in_mhz); - swath_width_ub_l = rq_dlg_param.rq_l.swath_width_ub; - dpte_groups_per_row_ub_l = rq_dlg_param.rq_l.dpte_groups_per_row_ub; - swath_width_ub_c = rq_dlg_param.rq_c.swath_width_ub; - dpte_groups_per_row_ub_c = rq_dlg_param.rq_c.dpte_groups_per_row_ub; + swath_width_ub_l = rq_dlg_param->rq_l.swath_width_ub; + dpte_groups_per_row_ub_l = rq_dlg_param->rq_l.dpte_groups_per_row_ub; + swath_width_ub_c = rq_dlg_param->rq_c.swath_width_ub; + dpte_groups_per_row_ub_c = rq_dlg_param->rq_c.dpte_groups_per_row_ub; - meta_chunks_per_row_ub_l = rq_dlg_param.rq_l.meta_chunks_per_row_ub; - meta_chunks_per_row_ub_c = rq_dlg_param.rq_c.meta_chunks_per_row_ub; + meta_chunks_per_row_ub_l = rq_dlg_param->rq_l.meta_chunks_per_row_ub; + meta_chunks_per_row_ub_c = rq_dlg_param->rq_c.meta_chunks_per_row_ub; vupdate_offset = dst->vupdate_offset; vupdate_width = dst->vupdate_width; vready_offset = dst->vready_offset; @@ -1379,16 +1379,16 @@ static void dml_rq_dlg_get_dlg_params(struct display_mode_lib *mode_lib, dml_print("DML_DLG: %s: vratio_pre_c=%3.2f\n", __func__, vratio_pre_c); // Active - req_per_swath_ub_l = rq_dlg_param.rq_l.req_per_swath_ub; - req_per_swath_ub_c = rq_dlg_param.rq_c.req_per_swath_ub; - meta_row_height_l = rq_dlg_param.rq_l.meta_row_height; - meta_row_height_c = rq_dlg_param.rq_c.meta_row_height; + req_per_swath_ub_l = rq_dlg_param->rq_l.req_per_swath_ub; + req_per_swath_ub_c = rq_dlg_param->rq_c.req_per_swath_ub; + meta_row_height_l = rq_dlg_param->rq_l.meta_row_height; + meta_row_height_c = rq_dlg_param->rq_c.meta_row_height; swath_width_pixels_ub_l = 0; swath_width_pixels_ub_c = 0; scaler_rec_in_width_l = 0; scaler_rec_in_width_c = 0; - dpte_row_height_l = rq_dlg_param.rq_l.dpte_row_height; - dpte_row_height_c = rq_dlg_param.rq_c.dpte_row_height; + dpte_row_height_l = rq_dlg_param->rq_l.dpte_row_height; + dpte_row_height_c = rq_dlg_param->rq_c.dpte_row_height; if (mode_422) { swath_width_pixels_ub_l = swath_width_ub_l * 2; // *2 for 2 pixel per element @@ -1824,8 +1824,8 @@ static void dml_rq_dlg_get_dlg_params(struct display_mode_lib *mode_lib, disp_ttu_regs->min_ttu_vblank = min_ttu_vblank * refclk_freq_in_mhz; ASSERT(disp_ttu_regs->min_ttu_vblank < dml_pow(2, 24)); - print__ttu_regs_st(mode_lib, *disp_ttu_regs); - print__dlg_regs_st(mode_lib, *disp_dlg_regs); + print__ttu_regs_st(mode_lib, disp_ttu_regs); + print__dlg_regs_st(mode_lib, disp_dlg_regs); } void dml30_rq_dlg_get_dlg_reg(struct display_mode_lib *mode_lib, @@ -1861,20 +1861,20 @@ void dml30_rq_dlg_get_dlg_reg(struct display_mode_lib *mode_lib, dlg_sys_param.t_srx_delay_us = mode_lib->ip.dcfclk_cstate_latency / dlg_sys_param.deepsleep_dcfclk_mhz; // TODO: Deprecated - print__dlg_sys_params_st(mode_lib, dlg_sys_param); + print__dlg_sys_params_st(mode_lib, &dlg_sys_param); // system parameter calculation done dml_print("DML_DLG: Calculation for pipe[%d] start\n\n", pipe_idx); - dml_rq_dlg_get_rq_params(mode_lib, &rq_param, e2e_pipe_param[pipe_idx].pipe); + dml_rq_dlg_get_rq_params(mode_lib, &rq_param, &e2e_pipe_param[pipe_idx].pipe); dml_rq_dlg_get_dlg_params(mode_lib, e2e_pipe_param, num_pipes, pipe_idx, dlg_regs, ttu_regs, - rq_param.dlg, - dlg_sys_param, + &rq_param.dlg, + &dlg_sys_param, cstate_en, pstate_en, vm_en, diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.h b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.h index c04965cceff3..b40abc41828a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.h +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_rq_dlg_calc_30.h @@ -41,7 +41,7 @@ struct display_mode_lib; // See also: <display_rq_regs_st> void dml30_rq_dlg_get_rq_reg(struct display_mode_lib *mode_lib, display_rq_regs_st *rq_regs, - const display_pipe_params_st pipe_param); + const display_pipe_params_st *pipe_param); // Function: dml_rq_dlg_get_dlg_reg // Calculate and return DLG and TTU register struct given the system setting diff --git a/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.h b/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.h index 2555ef0358c2..fb61a0b1470f 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.h +++ b/drivers/gpu/drm/amd/display/dc/dml/display_rq_dlg_helpers.h @@ -31,16 +31,16 @@ /* Function: Printer functions * Print various struct */ -void print__rq_params_st(struct display_mode_lib *mode_lib, display_rq_params_st rq_param); -void print__data_rq_sizing_params_st(struct display_mode_lib *mode_lib, display_data_rq_sizing_params_st rq_sizing); -void print__data_rq_dlg_params_st(struct display_mode_lib *mode_lib, display_data_rq_dlg_params_st rq_dlg_param); -void print__data_rq_misc_params_st(struct display_mode_lib *mode_lib, display_data_rq_misc_params_st rq_misc_param); -void print__rq_dlg_params_st(struct display_mode_lib *mode_lib, display_rq_dlg_params_st rq_dlg_param); -void print__dlg_sys_params_st(struct display_mode_lib *mode_lib, display_dlg_sys_params_st dlg_sys_param); +void print__rq_params_st(struct display_mode_lib *mode_lib, const display_rq_params_st *rq_param); +void print__data_rq_sizing_params_st(struct display_mode_lib *mode_lib, const display_data_rq_sizing_params_st *rq_sizing); +void print__data_rq_dlg_params_st(struct display_mode_lib *mode_lib, const display_data_rq_dlg_params_st *rq_dlg_param); +void print__data_rq_misc_params_st(struct display_mode_lib *mode_lib, const display_data_rq_misc_params_st *rq_misc_param); +void print__rq_dlg_params_st(struct display_mode_lib *mode_lib, const display_rq_dlg_params_st *rq_dlg_param); +void print__dlg_sys_params_st(struct display_mode_lib *mode_lib, const display_dlg_sys_params_st *dlg_sys_param); -void print__data_rq_regs_st(struct display_mode_lib *mode_lib, display_data_rq_regs_st data_rq_regs); -void print__rq_regs_st(struct display_mode_lib *mode_lib, display_rq_regs_st rq_regs); -void print__dlg_regs_st(struct display_mode_lib *mode_lib, display_dlg_regs_st dlg_regs); -void print__ttu_regs_st(struct display_mode_lib *mode_lib, display_ttu_regs_st ttu_regs); +void print__data_rq_regs_st(struct display_mode_lib *mode_lib, const display_data_rq_regs_st *data_rq_regs); +void print__rq_regs_st(struct display_mode_lib *mode_lib, const display_rq_regs_st *rq_regs); +void print__dlg_regs_st(struct display_mode_lib *mode_lib, const display_dlg_regs_st *dlg_regs); +void print__ttu_regs_st(struct display_mode_lib *mode_lib, const display_ttu_regs_st *ttu_regs); #endif