On Mon, 2019-05-13 at 17:16 +0300, Ville Syrjälä wrote:
> On Wed, May 08, 2019 at 09:05:06PM +0000, Sripada, Radhakrishna wrote:
> > On Fri, 2019-05-03 at 22:08 +0300, Ville Syrjala wrote:
> > > From: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx>
> > > 
> > > ICL has so many planes that it can easily exceed the maximum
> > > effective memory bandwidth of the system. We must therefore check
> > > that we don't exceed that limit.
> > > 
> > > The algorithm is very magic number heavy and lacks sufficient
> > > explanation for now. We also have no sane way to query the
> > > memory clock and timings, so we must rely on a combination of
> > > raw readout from the memory controller and hardcoded assumptions.
> > > The memory controller values obviously change as the system
> > > jumps between the different SAGV points, so we try to stabilize
> > > it first by disabling SAGV for the duration of the readout.
> > > 
> > > The utilized bandwidth is tracked via a device wide atomic
> > > private object. That is actually not robust because we can't
> > > afford to enforce strict global ordering between the pipes.
> > > Thus I think I'll need to change this to simply chop up the
> > > available bandwidth between all the active pipes. Each pipe
> > > can then do whatever it wants as long as it doesn't exceed
> > > its budget. That scheme will also require that we assume that
> > > any number of planes could be active at any time.
> > > 
> > > TODO: make it robust and deal with all the open questions
> > > 
> > > v2: Sleep longer after disabling SAGV
> > > v3: Poll for the dclk to get raised (seen it take 250ms!)
> > >     If the system has 2133MT/s memory then we pointlessly
> > >     wait one full second :(
> > > v4: Use the new pcode interface to get the qgv points rather
> > >     than using hardcoded numbers
> > > 
> > > Signed-off-by: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx>
> > > ---
> > >  drivers/gpu/drm/i915/Makefile             |   1 +
> > >  drivers/gpu/drm/i915/i915_drv.c           | 229 ++++++++++++++++++++++
> > >  drivers/gpu/drm/i915/i915_drv.h           |  10 +
> > >  drivers/gpu/drm/i915/i915_reg.h           |   3 +
> > >  drivers/gpu/drm/i915/intel_atomic_plane.c |  20 ++
> > >  drivers/gpu/drm/i915/intel_atomic_plane.h |   2 +
> > >  drivers/gpu/drm/i915/intel_bw.c           | 181 +++++++++++++++++
> > >  drivers/gpu/drm/i915/intel_bw.h           |  46 +++++
> > >  drivers/gpu/drm/i915/intel_display.c      |  40 +++-
> > >  drivers/gpu/drm/i915/intel_drv.h          |   2 +
> > >  10 files changed, 533 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/i915/intel_bw.c
> > >  create mode 100644 drivers/gpu/drm/i915/intel_bw.h
> > > 
> > > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> > > index 68106fe35a04..139a0fc19390 100644
> > > --- a/drivers/gpu/drm/i915/Makefile
> > > +++ b/drivers/gpu/drm/i915/Makefile
> > > @@ -138,6 +138,7 @@ i915-y += intel_audio.o \
> > >  	  intel_atomic.o \
> > >  	  intel_atomic_plane.o \
> > >  	  intel_bios.o \
> > > +	  intel_bw.o \
> > >  	  intel_cdclk.o \
> > >  	  intel_color.o \
> > >  	  intel_combo_phy.o \
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > > index 5ed864752c7b..b7fa7b51c2e2 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.c
> > > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > > @@ -70,6 +70,7 @@
> > >  #include "intel_overlay.h"
> > >  #include "intel_pipe_crc.h"
> > >  #include "intel_pm.h"
> > > +#include "intel_sideband.h"
> > >  #include "intel_sprite.h"
> > >  #include "intel_uc.h"
> > >  
> > > @@ -1435,6 +1436,232 @@ bxt_get_dram_info(struct drm_i915_private *dev_priv)
> > >  	return 0;
> > >  }
> > >  
> > > +struct intel_qgv_point {
> > > +	u16 dclk, t_rp, t_rdpre, t_rc, t_ras, t_rcd;
> > > +};
> > > +
> > > +struct intel_sagv_info {
> > > +	struct intel_qgv_point points[3];
> > > +	u8 num_points;
> > > +	u8 num_channels;
> > > +	u8 t_bl;
> > > +	enum intel_dram_type dram_type;
> > > +};
> > > +
> > > +static int icl_pcode_read_mem_global_info(struct drm_i915_private *dev_priv,
> > > +					  struct intel_sagv_info *si)
> > > +{
> > > +	u32 val = 0;
> > > +	int ret;
> > > +
> > > +	ret = sandybridge_pcode_read(dev_priv,
> > > +				     ICL_PCODE_MEM_SUBSYSYSTEM_INFO |
> > > +				     ICL_PCODE_MEM_SS_READ_GLOBAL_INFO,
> > > +				     &val, NULL);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	switch (val & 0xf) {
> > > +	case 0:
> > > +		si->dram_type = INTEL_DRAM_DDR4;
> > > +		break;
> > > +	case 1:
> > > +		si->dram_type = INTEL_DRAM_DDR3;
> > > +		break;
> > > +	case 2:
> > > +		si->dram_type = INTEL_DRAM_LPDDR3;
> > > +		break;
> > > +	case 3:
> > > +		si->dram_type = INTEL_DRAM_LPDDR3;
> > > +		break;
> > > +	default:
> > > +		MISSING_CASE(val & 0xf);
> > > +		break;
> > > +	}
> > > +
> > > +	si->num_channels = (val & 0xf0) >> 4;
> > > +	si->num_points = (val & 0xf00) >> 8;
> > > +
> > > +	si->t_bl = si->dram_type == INTEL_DRAM_DDR4 ? 4 : 8;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int icl_pcode_read_qgv_point_info(struct drm_i915_private *dev_priv,
> > > +					 struct intel_qgv_point *sp,
> > > +					 int point)
> > 
> > Are we trying to retrieve the DRAM timing parameters to calculate the
> > latency? If so, can that be separated out as a latency calculation
> > instead of being used under the bw info below?
> > 
> > > +{
> > > +	u32 val = 0, val2;
> > > +	int ret;
> > > +
> > > +	ret = sandybridge_pcode_read(dev_priv,
> > > +				     ICL_PCODE_MEM_SUBSYSYSTEM_INFO |
> > > +				     ICL_PCODE_MEM_SS_READ_QGV_POINT_INFO(point),
> > > +				     &val, &val2);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	sp->dclk = val & 0xffff;
> > > +	sp->t_rp = (val & 0xff0000) >> 16;
> > > +	sp->t_rcd = (val & 0xff000000) >> 24;
> > > +
> > > +	sp->t_rdpre = val2 & 0xff;
> > > +	sp->t_ras = (val2 & 0xff00) >> 8;
> > > +
> > > +	sp->t_rc = sp->t_rp + sp->t_ras;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int icl_get_qgv_points(struct drm_i915_private *dev_priv,
> > > +			      struct intel_sagv_info *si)
> > > +{
> > > +	int i, ret;
> > > +
> > > +	ret = icl_pcode_read_mem_global_info(dev_priv, si);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	if (WARN_ON(si->num_points > ARRAY_SIZE(si->points)))
> > > +		si->num_points = ARRAY_SIZE(si->points);
> > > +
> > > +	for (i = 0; i < si->num_points; i++) {
> > > +		struct intel_qgv_point *sp = &si->points[i];
> > > +
> > > +		ret = icl_pcode_read_qgv_point_info(dev_priv, sp, i);
> > > +		if (ret)
> > > +			return ret;
> > > +
> > > +		DRM_DEBUG_KMS("QGV %d: DCLK=%d tRP=%d tRDPRE=%d tRAS=%d tRCD=%d tRC=%d\n",
> > > +			      i, sp->dclk, sp->t_rp, sp->t_rdpre, sp->t_ras,
> > > +			      sp->t_rcd, sp->t_rc);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int icl_calc_bw(int dclk, int num, int den)
> > > +{
> > > +	/* multiples of 16.666MHz (100/6) */
> > > +	return DIV_ROUND_CLOSEST(num * dclk * 100, den * 6);
> > > +}
> > > +
> > > +static int icl_sagv_max_dclk(const struct intel_sagv_info *si)
> > > +{
> > > +	u16 dclk = 0;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < si->num_points; i++)
> > > +		dclk = max(dclk, si->points[i].dclk);
> > > +
> > > +	return dclk;
> > > +}
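
A note on units while reading the above (my interpretation, not stated in
the patch): pcode appears to report dclk in multiples of 16.666 MHz, so
icl_calc_bw(dclk, num, den) works out to (num/den) bytes per dclk cycle
expressed in MB/s. E.g. with a hypothetical dclk_max of 72 (~1200 MHz):

	icl_calc_bw(72, 16, 1); /* = DIV_ROUND_CLOSEST(16 * 72 * 100, 1 * 6) = 19200 MB/s */

which would make maxdebw in icl_get_bw_info() below
min(25 * 1000, 19200 * 6 / 10) = 11520 MB/s, i.e. in this example the 60%
cap wins over the 25 GB/s deprogbwlimit.
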
> > > +
> > > +struct intel_sa_info {
> > > +	u8 deburst, mpagesize, deprogbwlimit, displayrtids;
> > > +};
> > > +
> > > +static const struct intel_sa_info icl_sa_info = {
> > > +	.deburst = 8,
> > > +	.mpagesize = 16,
> > > +	.deprogbwlimit = 25, /* GB/s */
> > > +	.displayrtids = 128,
> > > +};
> > > +
> > > +static int icl_get_bw_info(struct drm_i915_private *dev_priv)
> > > +{
> > > +	struct intel_sagv_info si = {};
> > > +	const struct intel_sa_info *sa = &icl_sa_info;
> > > +	bool is_y_tile = true; /* assume y tile may be used */
> > > +	int num_channels;
> > > +	int deinterleave;
> > > +	int ipqdepth, ipqdepthpch;
> > > +	int dclk_max;
> > > +	int maxdebw;
> > > +	int i, ret;
> > > +
> > > +	ret = icl_get_qgv_points(dev_priv, &si);
> > > +	if (ret)
> > > +		return ret;
> > > +	num_channels = si.num_channels;
> > > +
> > > +	deinterleave = DIV_ROUND_UP(num_channels, is_y_tile ? 4 : 2);
> > > +	dclk_max = icl_sagv_max_dclk(&si);
> > > +
> > > +	ipqdepthpch = 16;
> > > +
> > > +	maxdebw = min(sa->deprogbwlimit * 1000,
> > > +		      icl_calc_bw(dclk_max, 16, 1) * 6 / 10); /* 60% */
> > > +	ipqdepth = min(ipqdepthpch, sa->displayrtids / num_channels);
> > > +
> > > +	for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> > > +		struct intel_bw_info *bi = &dev_priv->max_bw[i];
> > > +		int clpchgroup;
> > > +		int j;
> > > +
> > > +		clpchgroup = (sa->deburst * deinterleave / num_channels) << i;
> > > +		bi->num_planes = (ipqdepth - clpchgroup) / clpchgroup + 1;
> > > +
> > > +		for (j = 0; j < si.num_points; j++) {
> > > +			const struct intel_qgv_point *sp = &si.points[j];
> > > +			int ct, bw;
> > > +
> > > +			/*
> > > +			 * Max row cycle time
> > > +			 *
> > > +			 * FIXME what is the logic behind the
> > > +			 * assumed burst length?
> > > +			 */
> > > +			ct = max_t(int, sp->t_rc, sp->t_rp + sp->t_rcd +
> > > +				   (clpchgroup - 1) * si.t_bl + sp->t_rdpre);
> > 
> > For logical flow, can we move the above timing-related calculations to a
> > separate function, along with the FIXME, to delink the bandwidth and
> > latency calculations?
> 
> I don't see what you would want to delink. This is all just
> calculating the effective memory bandwidth limit.

I am assuming the variables t_rc, t_rp, t_rcd are related to DRAM
latency/timing information. I was wondering if this latency calculation
could be moved to a separate inline function for readability purposes.
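
Purely as an illustration of the kind of split I mean (helper name and
exact shape hypothetical, not part of the patch):

	/* hoist the row cycle time math out of icl_get_bw_info() */
	static inline int icl_qgv_max_ct(const struct intel_qgv_point *sp,
					 int clpchgroup, int t_bl)
	{
		/* max of tRC and the tRP + tRCD + burst + tRdPre sum */
		return max_t(int, sp->t_rc,
			     sp->t_rp + sp->t_rcd +
			     (clpchgroup - 1) * t_bl + sp->t_rdpre);
	}

The loop body would then just do ct = icl_qgv_max_ct(sp, clpchgroup,
si.t_bl), with the bandwidth math staying where it is, and the FIXME about
the assumed burst length could move along with it.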

- Radhakrishna(RK) Sripada

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx