> -----Original Message----- > From: Intel-gfx <intel-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Ville > Syrjala > Sent: Wednesday, November 20, 2024 10:11 PM > To: intel-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: stable@xxxxxxxxxxxxxxx > Subject: [PATCH 1/4] drm/i915/dsb: Don't use indexed register writes needlessly > > From: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > > Turns out the DSB indexed register write command has rather significant initial > overhead compared to the normal MMIO write command. Based on some quick > experiments on TGL you have to write the register at least ~5 times for the > indexed write command to come out ahead. If you write the register less times > than that the MMIO write is faster. Given the benchmarking results and latency of indexed writes, this change makes sense. Changes look Good to me. Reviewed-by: Uma Shankar <uma.shankar@xxxxxxxxx> > So it seems my automagic indexed write logic was a bit misguided. Go back to the > original approach only use indexed writes for the cases we know will benefit from > it (indexed LUT register updates). > > Currently we shouldn't have any cases where this truly matters (just some rare > double writes to the precision LUT index registers), but we will need to switch the > legacy LUT updates to write each LUT register twice (to avoid some palette anti- > collision logic troubles). > This would be close to the worst case for using indexed writes (two writes per > register, and 256 separate registers). > Using the MMIO write command should shave off around 30% of the execution > time compared to using the indexed write command. > > Cc: stable@xxxxxxxxxxxxxxx > Fixes: 34d8311f4a1c ("drm/i915/dsb: Re-instate DSB for LUT updates") > Fixes: 25ea3411bd23 ("drm/i915/dsb: Use non-posted register writes for legacy > LUT") > Signed-off-by: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/display/intel_color.c | 51 +++++++++++++--------- > drivers/gpu/drm/i915/display/intel_dsb.c | 19 ++++++-- > drivers/gpu/drm/i915/display/intel_dsb.h | 2 + > 3 files changed, 49 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/i915/display/intel_color.c > b/drivers/gpu/drm/i915/display/intel_color.c > index 174753625bca..6ea3d5c58cb1 100644 > --- a/drivers/gpu/drm/i915/display/intel_color.c > +++ b/drivers/gpu/drm/i915/display/intel_color.c > @@ -1343,6 +1343,17 @@ static void ilk_lut_write(const struct intel_crtc_state > *crtc_state, > intel_de_write_fw(display, reg, val); } > > +static void ilk_lut_write_indexed(const struct intel_crtc_state *crtc_state, > + i915_reg_t reg, u32 val) > +{ > + struct intel_display *display = to_intel_display(crtc_state); > + > + if (crtc_state->dsb_color_vblank) > + intel_dsb_reg_write_indexed(crtc_state->dsb_color_vblank, reg, > val); > + else > + intel_de_write_fw(display, reg, val); } > + > static void ilk_load_lut_8(const struct intel_crtc_state *crtc_state, > const struct drm_property_blob *blob) { @@ -1458,8 > +1469,8 @@ static void bdw_load_lut_10(const struct intel_crtc_state > *crtc_state, > prec_index); > > for (i = 0; i < lut_size; i++) > - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), > - ilk_lut_10(&lut[i])); > + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), > + ilk_lut_10(&lut[i])); > > /* > * Reset the index, otherwise it prevents the legacy palette to be @@ - > 1612,16 +1623,16 @@ static void glk_load_degamma_lut(const struct > intel_crtc_state *crtc_state, > * ToDo: Extend to max 7.0. Enable 32 bit input value > * as compared to just 16 to achieve this. > */ > - ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe), > - DISPLAY_VER(display) >= 14 ? > - mtl_degamma_lut(&lut[i]) : > glk_degamma_lut(&lut[i])); > + ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe), > + DISPLAY_VER(display) >= 14 ? > + mtl_degamma_lut(&lut[i]) : > glk_degamma_lut(&lut[i])); > } > > /* Clamp values > 1.0. */ > while (i++ < glk_degamma_lut_size(display)) > - ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe), > - DISPLAY_VER(display) >= 14 ? > - 1 << 24 : 1 << 16); > + ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe), > + DISPLAY_VER(display) >= 14 ? > + 1 << 24 : 1 << 16); > > ilk_lut_write(crtc_state, PRE_CSC_GAMC_INDEX(pipe), 0); } @@ - > 1687,10 +1698,10 @@ icl_program_gamma_superfine_segment(const struct > intel_crtc_state *crtc_state) > for (i = 0; i < 9; i++) { > const struct drm_color_lut *entry = &lut[i]; > > - ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), > - ilk_lut_12p4_ldw(entry)); > - ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), > - ilk_lut_12p4_udw(entry)); > + ilk_lut_write_indexed(crtc_state, > PREC_PAL_MULTI_SEG_DATA(pipe), > + ilk_lut_12p4_ldw(entry)); > + ilk_lut_write_indexed(crtc_state, > PREC_PAL_MULTI_SEG_DATA(pipe), > + ilk_lut_12p4_udw(entry)); > } > > ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_INDEX(pipe), @@ - > 1726,10 +1737,10 @@ icl_program_gamma_multi_segment(const struct > intel_crtc_state *crtc_state) > for (i = 1; i < 257; i++) { > entry = &lut[i * 8]; > > - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), > - ilk_lut_12p4_ldw(entry)); > - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), > - ilk_lut_12p4_udw(entry)); > + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), > + ilk_lut_12p4_ldw(entry)); > + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), > + ilk_lut_12p4_udw(entry)); > } > > /* > @@ -1747,10 +1758,10 @@ icl_program_gamma_multi_segment(const struct > intel_crtc_state *crtc_state) > for (i = 0; i < 256; i++) { > entry = &lut[i * 8 * 128]; > > - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), > - ilk_lut_12p4_ldw(entry)); > - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), > - ilk_lut_12p4_udw(entry)); > + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), > + ilk_lut_12p4_ldw(entry)); > + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), > + ilk_lut_12p4_udw(entry)); > } > > ilk_lut_write(crtc_state, PREC_PAL_INDEX(pipe), diff --git > a/drivers/gpu/drm/i915/display/intel_dsb.c > b/drivers/gpu/drm/i915/display/intel_dsb.c > index b7b44399adaa..4d3785f5cb52 100644 > --- a/drivers/gpu/drm/i915/display/intel_dsb.c > +++ b/drivers/gpu/drm/i915/display/intel_dsb.c > @@ -273,16 +273,20 @@ static bool intel_dsb_prev_ins_is_indexed_write(struct > intel_dsb *dsb, i915_reg_ } > > /** > - * intel_dsb_reg_write() - Emit register wriite to the DSB context > + * intel_dsb_reg_write_indexed() - Emit register wriite to the DSB > + context > * @dsb: DSB context > * @reg: register address. > * @val: value. > * > * This function is used for writing register-value pair in command > * buffer of DSB. > + * > + * Note that indexed writes are slower than normal MMIO writes > + * for a small number (less than 5 or so) of writes to the same > + * register. > */ > -void intel_dsb_reg_write(struct intel_dsb *dsb, > - i915_reg_t reg, u32 val) > +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, > + i915_reg_t reg, u32 val) > { > /* > * For example the buffer will look like below for 3 dwords for auto @@ - > 340,6 +344,15 @@ void intel_dsb_reg_write(struct intel_dsb *dsb, > } > } > > +void intel_dsb_reg_write(struct intel_dsb *dsb, > + i915_reg_t reg, u32 val) > +{ > + intel_dsb_emit(dsb, val, > + (DSB_OPCODE_MMIO_WRITE << DSB_OPCODE_SHIFT) | > + (DSB_BYTE_EN << DSB_BYTE_EN_SHIFT) | > + i915_mmio_reg_offset(reg)); > +} > + > static u32 intel_dsb_mask_to_byte_en(u32 mask) { > return (!!(mask & 0xff000000) << 3 | > diff --git a/drivers/gpu/drm/i915/display/intel_dsb.h > b/drivers/gpu/drm/i915/display/intel_dsb.h > index 33e0fc2ab380..da6df07a3c83 100644 > --- a/drivers/gpu/drm/i915/display/intel_dsb.h > +++ b/drivers/gpu/drm/i915/display/intel_dsb.h > @@ -34,6 +34,8 @@ void intel_dsb_finish(struct intel_dsb *dsb); void > intel_dsb_cleanup(struct intel_dsb *dsb); void intel_dsb_reg_write(struct > intel_dsb *dsb, > i915_reg_t reg, u32 val); > +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, > + i915_reg_t reg, u32 val); > void intel_dsb_reg_write_masked(struct intel_dsb *dsb, > i915_reg_t reg, u32 mask, u32 val); void > intel_dsb_noop(struct intel_dsb *dsb, int count); > -- > 2.45.2