On 01/31/2012 07:47 AM, Daniel Vetter wrote: > We have to do this manually. Somebody had a Great Idea. > > I've measured speed-ups just a few percent above the noise level > (below 5% for the best case), but no slowdowns. Chris Wilson measured > quite a bit more (10-20% above the usual snb variance) on a more > recent and better tuned version of sna, but also recorded a few > slow-downs on benchmarks known for uglier amounts of snb-induced > variance. > > v2: Incorporate Ben Widawsky's preliminary review comments and > elaborate a bit about the performance impact in the changelog. > > Acked-by: Chris Wilson <chris at chris-wilson.co.uk> > Signed-Off-by: Daniel Vetter <daniel.vetter at ffwll.ch> You didn't address one question I really cared about: how is it safe to ignore channel 3 size? While I'm at it, I wonder what is in these registers if you have less than 256MB. If the answer is zero, then your check isn't safe enough below. As an aside, this will potentially break our simulation environment, but that's environment fail. 
> --- > drivers/gpu/drm/i915/i915_dma.c | 2 +- > drivers/gpu/drm/i915/i915_drv.c | 4 ++- > drivers/gpu/drm/i915/i915_drv.h | 3 +- > drivers/gpu/drm/i915/i915_gem.c | 23 +++++++++++++++++++- > drivers/gpu/drm/i915/i915_gem_tiling.c | 16 +++++++++++++- > drivers/gpu/drm/i915/i915_reg.h | 34 ++++++++++++++++++++++++++++++++ > 6 files changed, 75 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c > index 3f27173..dfef956 100644 > --- a/drivers/gpu/drm/i915/i915_dma.c > +++ b/drivers/gpu/drm/i915/i915_dma.c > @@ -1208,7 +1208,7 @@ static int i915_load_gem_init(struct drm_device *dev) > i915_gem_do_init(dev, 0, mappable_size, gtt_size - PAGE_SIZE); > > mutex_lock(&dev->struct_mutex); > - ret = i915_gem_init_ringbuffer(dev); > + ret = i915_gem_init_hw(dev); > mutex_unlock(&dev->struct_mutex); > if (ret) > return ret; > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index 1658cfd..12ddf47 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -495,7 +495,7 @@ static int i915_drm_thaw(struct drm_device *dev) > mutex_lock(&dev->struct_mutex); > dev_priv->mm.suspended = 0; > > - error = i915_gem_init_ringbuffer(dev); > + error = i915_gem_init_hw(dev); > mutex_unlock(&dev->struct_mutex); > > if (HAS_PCH_SPLIT(dev)) > @@ -686,6 +686,8 @@ int i915_reset(struct drm_device *dev, u8 flags) > !dev_priv->mm.suspended) { > dev_priv->mm.suspended = 0; > > + i915_gem_init_swizzling(dev); > + > dev_priv->ring[RCS].init(&dev_priv->ring[RCS]); > if (HAS_BSD(dev)) > dev_priv->ring[VCS].init(&dev_priv->ring[VCS]); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 865de80..0845419 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -1187,7 +1187,8 @@ int __must_check i915_gem_object_set_domain(struct drm_i915_gem_object *obj, > uint32_t read_domains, > uint32_t write_domain); > int 
__must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj); > -int __must_check i915_gem_init_ringbuffer(struct drm_device *dev); > +int __must_check i915_gem_init_hw(struct drm_device *dev); > +void i915_gem_init_swizzling(struct drm_device *dev); > void i915_gem_cleanup_ringbuffer(struct drm_device *dev); > void i915_gem_do_init(struct drm_device *dev, > unsigned long start, > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 51a2b0c..86fffd2 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -3681,12 +3681,31 @@ i915_gem_idle(struct drm_device *dev) > return 0; > } > > +void i915_gem_init_swizzling(struct drm_device *dev) > +{ > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + if (INTEL_INFO(dev)->gen < 6 || > + dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE) > + return; > + > + I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) | > + DISP_TILE_SURFACE_SWIZZLING); > + > + I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL); > + if (IS_GEN6(dev)) > + I915_WRITE(ARB_MODE, ARB_MODE_ENABLE(ARB_MODE_SWIZZLE_SNB)); > + else > + I915_WRITE(ARB_MODE, ARB_MODE_ENABLE(ARB_MODE_SWIZZLE_IVB)); > +} > int > -i915_gem_init_ringbuffer(struct drm_device *dev) > +i915_gem_init_hw(struct drm_device *dev) > { > drm_i915_private_t *dev_priv = dev->dev_private; > int ret; > > + i915_gem_init_swizzling(dev); > + > ret = intel_init_render_ring_buffer(dev); > if (ret) > return ret; > @@ -3742,7 +3761,7 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data, > mutex_lock(&dev->struct_mutex); > dev_priv->mm.suspended = 0; > > - ret = i915_gem_init_ringbuffer(dev); > + ret = i915_gem_init_hw(dev); > if (ret != 0) { > mutex_unlock(&dev->struct_mutex); > return ret; > diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c > index 861223b..acf89fe 100644 > --- a/drivers/gpu/drm/i915/i915_gem_tiling.c > +++ 
b/drivers/gpu/drm/i915/i915_gem_tiling.c > @@ -93,8 +93,20 @@ i915_gem_detect_bit_6_swizzle(struct drm_device *dev) > uint32_t swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; > > if (INTEL_INFO(dev)->gen >= 6) { > - swizzle_x = I915_BIT_6_SWIZZLE_NONE; > - swizzle_y = I915_BIT_6_SWIZZLE_NONE; > + uint32_t dimm_c0, dimm_c1; > + dimm_c0 = I915_READ(MAD_DIMM_C0); > + dimm_c1 = I915_READ(MAD_DIMM_C1); > + dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; > + dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; > + /* Enable swizzling when the channels are populated with > + * identically sized dimms. */ > + if (dimm_c0 == dimm_c1) { > + swizzle_x = I915_BIT_6_SWIZZLE_9_10; > + swizzle_y = I915_BIT_6_SWIZZLE_9; > + } else { > + swizzle_x = I915_BIT_6_SWIZZLE_NONE; > + swizzle_y = I915_BIT_6_SWIZZLE_NONE; > + } > } else if (IS_GEN5(dev)) { > /* On Ironlake whatever DRAM config, GPU always do > * same swizzling setup. > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index f960738..539ef90 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -295,6 +295,12 @@ > #define FENCE_REG_SANDYBRIDGE_0 0x100000 > #define SANDYBRIDGE_FENCE_PITCH_SHIFT 32 > > +/* control register for cpu gtt access */ > +#define TILECTL 0x101000 > +#define TILECTL_SWZCTL (1 << 0) > +#define TILECTL_TLB_PREFETCH_DIS (1 << 2) > +#define TILECTL_BACKSNOOP_DIS (1 << 3) > + > /* > * Instruction and interrupt control regs > */ > @@ -318,6 +324,11 @@ > #define RING_MAX_IDLE(base) ((base)+0x54) > #define RING_HWS_PGA(base) ((base)+0x80) > #define RING_HWS_PGA_GEN6(base) ((base)+0x2080) > +#define ARB_MODE 0x04030 > +#define ARB_MODE_SWIZZLE_SNB (1<<4) > +#define ARB_MODE_SWIZZLE_IVB (1<<5) > +#define ARB_MODE_ENABLE(x) GFX_MODE_ENABLE(x) > +#define ARB_MODE_DISABLE(x) GFX_MODE_DISABLE(x) > #define RENDER_HWS_PGA_GEN7 (0x04080) > #define RING_FAULT_REG(ring) (0x4094 + 0x100*(ring)->id) > #define DONE_REG 0x40b0 > @@ -1037,6 +1048,29 @@ > 
#define C0DRB3 0x10206 > #define C1DRB3 0x10606 > > +/** snb MCH registers for reading the DRAM channel configuration */ > +#define MAD_DIMM_C0 (MCHBAR_MIRROR_BASE_SNB + 0x5004) > +#define MAD_DIMM_C1 (MCHBAR_MIRROR_BASE_SNB + 0x5008) > +#define MAD_DIMM_C2 (MCHBAR_MIRROR_BASE_SNB + 0x500C) > +#define MAD_DIMM_ECC_MASK (0x3 << 24) > +#define MAD_DIMM_ECC_OFF (0x0 << 24) > +#define MAD_DIMM_ECC_IO_ON_LOGIC_OFF (0x1 << 24) > +#define MAD_DIMM_ECC_IO_OFF_LOGIC_ON (0x2 << 24) > +#define MAD_DIMM_ECC_ON (0x3 << 24) > +#define MAD_DIMM_ENH_INTERLEAVE (0x1 << 22) > +#define MAD_DIMM_RANK_INTERLEAVE (0x1 << 21) > +#define MAD_DIMM_B_WIDTH_X16 (0x1 << 20) /* X8 chips if unset */ > +#define MAD_DIMM_A_WIDTH_X16 (0x1 << 19) /* X8 chips if unset */ > +#define MAD_DIMM_B_DUAL_RANK (0x1 << 18) > +#define MAD_DIMM_A_DUAL_RANK (0x1 << 17) > +#define MAD_DIMM_A_SELECT (0x1 << 16) > +/* DIMM sizes are in multiples of 256mb. */ > +#define MAD_DIMM_B_SIZE_SHIFT 8 > +#define MAD_DIMM_B_SIZE_MASK (0xff << MAD_DIMM_B_SIZE_SHIFT) > +#define MAD_DIMM_A_SIZE_SHIFT 0 > +#define MAD_DIMM_A_SIZE_MASK (0xff << MAD_DIMM_A_SIZE_SHIFT) > + > + Is that a whitespace error for MAD_DIMM_C1 and MAD_DIMM_C2? > /* Clocking configuration register */ > #define CLKCFG 0x10c00 > #define CLKCFG_FSB_400 (5 << 0) /* hrawclk 100 */