> -----Original Message----- > From: Intel-gfx <intel-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Ville Syrjala > Sent: Thursday, October 21, 2021 4:04 AM > To: intel-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: [PATCH 4/4] drm/i915: Use unlocked register accesses for LUT > loads > > From: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > > We have to bash in a lot of registers to load the higher precision LUT modes. The > locking overhead is significant, especially as we have to get this done as quickly as > possible during vblank. > So let's switch to unlocked accesses for these. Fortunately the LUT registers are > mostly spread around such that two pipes do not have any registers on the same > cacheline. So as long as commits on the same pipe are serialized (which they are) we > should get away with this without angering the hardware. > > The only exceptions are the PREC_PIPEGCMAX registers on ilk/snb which we don't > use atm as they are only used in the 12bit gamma mode. If/when we add support for > that we may need to remember to still serialize those registers, though I'm not sure > ilk/snb are actually affected by the same cacheline issue. I think ivb/hsw at least > were, but they use a different set of registers for the precision LUT. > > I have a test case which is updating the LUTs on two pipes from a single atomic > commit. Running that in a loop for a minute I get the following worst case with the > locks in place: > intel_crtc_vblank_work_start: pipe B, frame=10037, scanline=1081 > intel_crtc_vblank_work_start: pipe A, frame=12274, scanline=769 > intel_crtc_vblank_work_end: pipe A, frame=12274, scanline=58 > intel_crtc_vblank_work_end: pipe B, frame=10037, scanline=74 > > And here's the worst case with the locks removed: > intel_crtc_vblank_work_start: pipe B, frame=5869, scanline=1081 > intel_crtc_vblank_work_start: pipe A, frame=7616, scanline=769 > intel_crtc_vblank_work_end: pipe B, frame=5869, scanline=1096 > intel_crtc_vblank_work_end: pipe A, frame=7616, scanline=777 > > The test was done on a snb using the 10bit 1024 entry LUT mode. > The vtotals for the two displays are 793 and 1125. So we can see that with the locks > ripped out the LUT updates are pretty nicely confined within the vblank, whereas > with the locks in place we're routinely blasting past the vblank end which causes > visual artifacts near the top of the screen. Unprotected writes should be ok to use in lut updates. Looks good to me. Reviewed-by: Uma Shankar <uma.shankar@xxxxxxxxx> One side query though, what happens when we go for higher refresh rates like 300+, Some cases where vblank period is shrunk to as low as 8us (480Hz RR panels). Regards, Uma Shankar > Signed-off-by: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/display/intel_color.c | 128 ++++++++++----------- > drivers/gpu/drm/i915/display/intel_dsb.c | 4 +- > 2 files changed, 66 insertions(+), 66 deletions(-) > > diff --git a/drivers/gpu/drm/i915/display/intel_color.c > b/drivers/gpu/drm/i915/display/intel_color.c > index 5359b7305a78..c870a0e50cb1 100644 > --- a/drivers/gpu/drm/i915/display/intel_color.c > +++ b/drivers/gpu/drm/i915/display/intel_color.c > @@ -552,8 +552,8 @@ static void i9xx_load_lut_8(struct intel_crtc *crtc, > lut = blob->data; > > for (i = 0; i < 256; i++) > - intel_de_write(dev_priv, PALETTE(pipe, i), > - i9xx_lut_8(&lut[i])); > + intel_de_write_fw(dev_priv, PALETTE(pipe, i), > + i9xx_lut_8(&lut[i])); > } > > static void i9xx_load_luts(const struct intel_crtc_state *crtc_state) @@ -576,15 > +576,15 @@ static void i965_load_lut_10p6(struct intel_crtc *crtc, > enum pipe pipe = crtc->pipe; > > for (i = 0; i < lut_size - 1; i++) { > - intel_de_write(dev_priv, PALETTE(pipe, 2 * i + 0), > - i965_lut_10p6_ldw(&lut[i])); > - intel_de_write(dev_priv, PALETTE(pipe, 2 * i + 1), > - i965_lut_10p6_udw(&lut[i])); > + intel_de_write_fw(dev_priv, PALETTE(pipe, 2 * i + 0), > + i965_lut_10p6_ldw(&lut[i])); > + intel_de_write_fw(dev_priv, PALETTE(pipe, 2 * i + 1), > + i965_lut_10p6_udw(&lut[i])); > } > > - intel_de_write(dev_priv, PIPEGCMAX(pipe, 0), lut[i].red); > - intel_de_write(dev_priv, PIPEGCMAX(pipe, 1), lut[i].green); > - intel_de_write(dev_priv, PIPEGCMAX(pipe, 2), lut[i].blue); > + intel_de_write_fw(dev_priv, PIPEGCMAX(pipe, 0), lut[i].red); > + intel_de_write_fw(dev_priv, PIPEGCMAX(pipe, 1), lut[i].green); > + intel_de_write_fw(dev_priv, PIPEGCMAX(pipe, 2), lut[i].blue); > } > > static void i965_load_luts(const struct intel_crtc_state *crtc_state) @@ -618,8 > +618,8 @@ static void ilk_load_lut_8(struct intel_crtc *crtc, > lut = blob->data; > > for (i = 0; i < 256; i++) > - intel_de_write(dev_priv, LGC_PALETTE(pipe, i), > - i9xx_lut_8(&lut[i])); > + intel_de_write_fw(dev_priv, LGC_PALETTE(pipe, i), > + i9xx_lut_8(&lut[i])); > } > > static void ilk_load_lut_10(struct intel_crtc *crtc, @@ -631,8 +631,8 @@ static void > ilk_load_lut_10(struct intel_crtc *crtc, > enum pipe pipe = crtc->pipe; > > for (i = 0; i < lut_size; i++) > - intel_de_write(dev_priv, PREC_PALETTE(pipe, i), > - ilk_lut_10(&lut[i])); > + intel_de_write_fw(dev_priv, PREC_PALETTE(pipe, i), > + ilk_lut_10(&lut[i])); > } > > static void ilk_load_luts(const struct intel_crtc_state *crtc_state) @@ -681,16 > +681,16 @@ static void ivb_load_lut_10(struct intel_crtc *crtc, > const struct drm_color_lut *entry = > &lut[i * (lut_size - 1) / (hw_lut_size - 1)]; > > - intel_de_write(dev_priv, PREC_PAL_INDEX(pipe), prec_index++); > - intel_de_write(dev_priv, PREC_PAL_DATA(pipe), > - ilk_lut_10(entry)); > + intel_de_write_fw(dev_priv, PREC_PAL_INDEX(pipe), prec_index++); > + intel_de_write_fw(dev_priv, PREC_PAL_DATA(pipe), > + ilk_lut_10(entry)); > } > > /* > * Reset the index, otherwise it prevents the legacy palette to be > * written properly. > */ > - intel_de_write(dev_priv, PREC_PAL_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PREC_PAL_INDEX(pipe), 0); > } > > /* On BDW+ the index auto increment mode actually works */ @@ -704,23 +704,23 > @@ static void bdw_load_lut_10(struct intel_crtc *crtc, > int i, lut_size = drm_color_lut_size(blob); > enum pipe pipe = crtc->pipe; > > - intel_de_write(dev_priv, PREC_PAL_INDEX(pipe), > - prec_index | PAL_PREC_AUTO_INCREMENT); > + intel_de_write_fw(dev_priv, PREC_PAL_INDEX(pipe), > + prec_index | PAL_PREC_AUTO_INCREMENT); > > for (i = 0; i < hw_lut_size; i++) { > /* We discard half the user entries in split gamma mode */ > const struct drm_color_lut *entry = > &lut[i * (lut_size - 1) / (hw_lut_size - 1)]; > > - intel_de_write(dev_priv, PREC_PAL_DATA(pipe), > - ilk_lut_10(entry)); > + intel_de_write_fw(dev_priv, PREC_PAL_DATA(pipe), > + ilk_lut_10(entry)); > } > > /* > * Reset the index, otherwise it prevents the legacy palette to be > * written properly. > */ > - intel_de_write(dev_priv, PREC_PAL_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PREC_PAL_INDEX(pipe), 0); > } > > static void ivb_load_lut_ext_max(const struct intel_crtc_state *crtc_state) @@ - > 821,9 +821,9 @@ static void glk_load_degamma_lut(const struct intel_crtc_state > *crtc_state) > * ignore the index bits, so we need to reset it to index 0 > * separately. > */ > - intel_de_write(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > - intel_de_write(dev_priv, PRE_CSC_GAMC_INDEX(pipe), > - PRE_CSC_GAMC_AUTO_INCREMENT); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_INDEX(pipe), > + PRE_CSC_GAMC_AUTO_INCREMENT); > > for (i = 0; i < lut_size; i++) { > /* > @@ -839,15 +839,15 @@ static void glk_load_degamma_lut(const struct > intel_crtc_state *crtc_state) > * ToDo: Extend to max 7.0. Enable 32 bit input value > * as compared to just 16 to achieve this. > */ > - intel_de_write(dev_priv, PRE_CSC_GAMC_DATA(pipe), > - lut[i].green); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_DATA(pipe), > + lut[i].green); > } > > /* Clamp values > 1.0. */ > while (i++ < 35) > - intel_de_write(dev_priv, PRE_CSC_GAMC_DATA(pipe), 1 << 16); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_DATA(pipe), 1 << 16); > > - intel_de_write(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > } > > static void glk_load_degamma_lut_linear(const struct intel_crtc_state *crtc_state) > @@ -862,21 +862,21 @@ static void glk_load_degamma_lut_linear(const struct > intel_crtc_state *crtc_stat > * ignore the index bits, so we need to reset it to index 0 > * separately. > */ > - intel_de_write(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > - intel_de_write(dev_priv, PRE_CSC_GAMC_INDEX(pipe), > - PRE_CSC_GAMC_AUTO_INCREMENT); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_INDEX(pipe), > + PRE_CSC_GAMC_AUTO_INCREMENT); > > for (i = 0; i < lut_size; i++) { > u32 v = (i << 16) / (lut_size - 1); > > - intel_de_write(dev_priv, PRE_CSC_GAMC_DATA(pipe), v); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_DATA(pipe), v); > } > > /* Clamp values > 1.0. */ > while (i++ < 35) > - intel_de_write(dev_priv, PRE_CSC_GAMC_DATA(pipe), 1 << 16); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_DATA(pipe), 1 << 16); > > - intel_de_write(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PRE_CSC_GAMC_INDEX(pipe), 0); > } > > static void glk_load_luts(const struct intel_crtc_state *crtc_state) @@ -1071,10 > +1071,10 @@ static void chv_load_cgm_degamma(struct intel_crtc *crtc, > enum pipe pipe = crtc->pipe; > > for (i = 0; i < lut_size; i++) { > - intel_de_write(dev_priv, CGM_PIPE_DEGAMMA(pipe, i, 0), > - chv_cgm_degamma_ldw(&lut[i])); > - intel_de_write(dev_priv, CGM_PIPE_DEGAMMA(pipe, i, 1), > - chv_cgm_degamma_udw(&lut[i])); > + intel_de_write_fw(dev_priv, CGM_PIPE_DEGAMMA(pipe, i, 0), > + chv_cgm_degamma_ldw(&lut[i])); > + intel_de_write_fw(dev_priv, CGM_PIPE_DEGAMMA(pipe, i, 1), > + chv_cgm_degamma_udw(&lut[i])); > } > } > > @@ -1105,10 +1105,10 @@ static void chv_load_cgm_gamma(struct intel_crtc > *crtc, > enum pipe pipe = crtc->pipe; > > for (i = 0; i < lut_size; i++) { > - intel_de_write(dev_priv, CGM_PIPE_GAMMA(pipe, i, 0), > - chv_cgm_gamma_ldw(&lut[i])); > - intel_de_write(dev_priv, CGM_PIPE_GAMMA(pipe, i, 1), > - chv_cgm_gamma_udw(&lut[i])); > + intel_de_write_fw(dev_priv, CGM_PIPE_GAMMA(pipe, i, 0), > + chv_cgm_gamma_ldw(&lut[i])); > + intel_de_write_fw(dev_priv, CGM_PIPE_GAMMA(pipe, i, 1), > + chv_cgm_gamma_udw(&lut[i])); > } > } > > @@ -1131,8 +1131,8 @@ static void chv_load_luts(const struct intel_crtc_state > *crtc_state) > else > i965_load_luts(crtc_state); > > - intel_de_write(dev_priv, CGM_PIPE_MODE(crtc->pipe), > - crtc_state->cgm_mode); > + intel_de_write_fw(dev_priv, CGM_PIPE_MODE(crtc->pipe), > + crtc_state->cgm_mode); > } > > void intel_color_load_luts(const struct intel_crtc_state *crtc_state) @@ -1808,7 > +1808,7 @@ static struct drm_property_blob *i9xx_read_lut_8(struct intel_crtc > *crtc) > lut = blob->data; > > for (i = 0; i < LEGACY_LUT_LENGTH; i++) { > - u32 val = intel_de_read(dev_priv, PALETTE(pipe, i)); > + u32 val = intel_de_read_fw(dev_priv, PALETTE(pipe, i)); > > i9xx_lut_8_pack(&lut[i], val); > } > @@ -1843,15 +1843,15 @@ static struct drm_property_blob > *i965_read_lut_10p6(struct intel_crtc *crtc) > lut = blob->data; > > for (i = 0; i < lut_size - 1; i++) { > - u32 ldw = intel_de_read(dev_priv, PALETTE(pipe, 2 * i + 0)); > - u32 udw = intel_de_read(dev_priv, PALETTE(pipe, 2 * i + 1)); > + u32 ldw = intel_de_read_fw(dev_priv, PALETTE(pipe, 2 * i + 0)); > + u32 udw = intel_de_read_fw(dev_priv, PALETTE(pipe, 2 * i + 1)); > > i965_lut_10p6_pack(&lut[i], ldw, udw); > } > > - lut[i].red = i965_lut_11p6_max_pack(intel_de_read(dev_priv, > PIPEGCMAX(pipe, 0))); > - lut[i].green = i965_lut_11p6_max_pack(intel_de_read(dev_priv, > PIPEGCMAX(pipe, 1))); > - lut[i].blue = i965_lut_11p6_max_pack(intel_de_read(dev_priv, > PIPEGCMAX(pipe, 2))); > + lut[i].red = i965_lut_11p6_max_pack(intel_de_read_fw(dev_priv, > PIPEGCMAX(pipe, 0))); > + lut[i].green = i965_lut_11p6_max_pack(intel_de_read_fw(dev_priv, > PIPEGCMAX(pipe, 1))); > + lut[i].blue = i965_lut_11p6_max_pack(intel_de_read_fw(dev_priv, > +PIPEGCMAX(pipe, 2))); > > return blob; > } > @@ -1886,8 +1886,8 @@ static struct drm_property_blob > *chv_read_cgm_gamma(struct intel_crtc *crtc) > lut = blob->data; > > for (i = 0; i < lut_size; i++) { > - u32 ldw = intel_de_read(dev_priv, CGM_PIPE_GAMMA(pipe, i, 0)); > - u32 udw = intel_de_read(dev_priv, CGM_PIPE_GAMMA(pipe, i, 1)); > + u32 ldw = intel_de_read_fw(dev_priv, CGM_PIPE_GAMMA(pipe, i, > 0)); > + u32 udw = intel_de_read_fw(dev_priv, CGM_PIPE_GAMMA(pipe, i, > 1)); > > chv_cgm_gamma_pack(&lut[i], ldw, udw); > } > @@ -1922,7 +1922,7 @@ static struct drm_property_blob *ilk_read_lut_8(struct > intel_crtc *crtc) > lut = blob->data; > > for (i = 0; i < LEGACY_LUT_LENGTH; i++) { > - u32 val = intel_de_read(dev_priv, LGC_PALETTE(pipe, i)); > + u32 val = intel_de_read_fw(dev_priv, LGC_PALETTE(pipe, i)); > > i9xx_lut_8_pack(&lut[i], val); > } > @@ -1947,7 +1947,7 @@ static struct drm_property_blob *ilk_read_lut_10(struct > intel_crtc *crtc) > lut = blob->data; > > for (i = 0; i < lut_size; i++) { > - u32 val = intel_de_read(dev_priv, PREC_PALETTE(pipe, i)); > + u32 val = intel_de_read_fw(dev_priv, PREC_PALETTE(pipe, i)); > > ilk_lut_10_pack(&lut[i], val); > } > @@ -1999,16 +1999,16 @@ static struct drm_property_blob > *bdw_read_lut_10(struct intel_crtc *crtc, > > lut = blob->data; > > - intel_de_write(dev_priv, PREC_PAL_INDEX(pipe), > - prec_index | PAL_PREC_AUTO_INCREMENT); > + intel_de_write_fw(dev_priv, PREC_PAL_INDEX(pipe), > + prec_index | PAL_PREC_AUTO_INCREMENT); > > for (i = 0; i < lut_size; i++) { > - u32 val = intel_de_read(dev_priv, PREC_PAL_DATA(pipe)); > + u32 val = intel_de_read_fw(dev_priv, PREC_PAL_DATA(pipe)); > > ilk_lut_10_pack(&lut[i], val); > } > > - intel_de_write(dev_priv, PREC_PAL_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PREC_PAL_INDEX(pipe), 0); > > return blob; > } > @@ -2050,17 +2050,17 @@ icl_read_lut_multi_segment(struct intel_crtc *crtc) > > lut = blob->data; > > - intel_de_write(dev_priv, PREC_PAL_MULTI_SEG_INDEX(pipe), > - PAL_PREC_AUTO_INCREMENT); > + intel_de_write_fw(dev_priv, PREC_PAL_MULTI_SEG_INDEX(pipe), > + PAL_PREC_AUTO_INCREMENT); > > for (i = 0; i < 9; i++) { > - u32 ldw = intel_de_read(dev_priv, > PREC_PAL_MULTI_SEG_DATA(pipe)); > - u32 udw = intel_de_read(dev_priv, > PREC_PAL_MULTI_SEG_DATA(pipe)); > + u32 ldw = intel_de_read_fw(dev_priv, > PREC_PAL_MULTI_SEG_DATA(pipe)); > + u32 udw = intel_de_read_fw(dev_priv, > PREC_PAL_MULTI_SEG_DATA(pipe)); > > icl_lut_multi_seg_pack(&lut[i], ldw, udw); > } > > - intel_de_write(dev_priv, PREC_PAL_MULTI_SEG_INDEX(pipe), 0); > + intel_de_write_fw(dev_priv, PREC_PAL_MULTI_SEG_INDEX(pipe), 0); > > /* > * FIXME readouts from PAL_PREC_DATA register aren't giving diff --git > a/drivers/gpu/drm/i915/display/intel_dsb.c > b/drivers/gpu/drm/i915/display/intel_dsb.c > index 62a8a69f9f5d..83a69a4a4fea 100644 > --- a/drivers/gpu/drm/i915/display/intel_dsb.c > +++ b/drivers/gpu/drm/i915/display/intel_dsb.c > @@ -100,7 +100,7 @@ void intel_dsb_indexed_reg_write(const struct > intel_crtc_state *crtc_state, > u32 reg_val; > > if (!dsb) { > - intel_de_write(dev_priv, reg, val); > + intel_de_write_fw(dev_priv, reg, val); > return; > } > buf = dsb->cmd_buf; > @@ -177,7 +177,7 @@ void intel_dsb_reg_write(const struct intel_crtc_state > *crtc_state, > > dsb = crtc_state->dsb; > if (!dsb) { > - intel_de_write(dev_priv, reg, val); > + intel_de_write_fw(dev_priv, reg, val); > return; > } > > -- > 2.32.0