Now that Chris Wilson has demonstrated that the key to stability on early
gen2 is to simply _never_ exchange the physical backing storage of batch
buffers, I've taken a stab at a kernel solution. It doesn't look too
nefarious imho, now that I no longer try to be too clever for my own good.

The open issue is how to coordinate the workaround between the kernel and
SNA, since SNA just pins a set of batches to achieve the same effect:

a) Add a flag to the kernel and disable the w/a in SNA when it is set
   (a rough userspace-side sketch follows below the '---' marker).

b) Check in the kernel whether the batch bo is pinned; if so, userspace
   already has the w/a in place and we don't need to copy things again.
   This is a bit ugly to wire up with the current approach, since the
   function implementing the batch w/a is far away from where we can
   detect whether the batch is already pinned by userspace.

c) Don't care - blitting batches isn't too expensive.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
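Not part of the patch, but to make option (a) a bit more concrete, here is a
rough userspace-side sketch. The parameter name and number are made up - no
such flag exists yet, it's only a placeholder for whatever the kernel would
expose; DRM_IOCTL_I915_GETPARAM itself is the existing interface SNA already
uses for feature checks.

#include <stdbool.h>
#include <string.h>

#include <xf86drm.h>
#include <i915_drm.h>

/* HYPOTHETICAL parameter - a placeholder for whatever flag option (a)
 * would add; it does not exist in this patch. */
#define I915_PARAM_HAS_BATCH_COPY_WA 0x7fff

static bool kernel_has_batch_copy_wa(int fd)
{
	struct drm_i915_getparam gp;
	int value = 0;

	memset(&gp, 0, sizeof(gp));
	gp.param = I915_PARAM_HAS_BATCH_COPY_WA;
	gp.value = &value;

	/* Kernels that don't know the param reject the ioctl; treat that
	 * as "keep pinning batches in userspace". */
	if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
		return false;

	return value != 0;
}

SNA could call something like this once at startup and skip its own
pinned-batch scheme whenever it returns true.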
 drivers/gpu/drm/i915/i915_drv.h         |  3 ++
 drivers/gpu/drm/i915/i915_irq.c         |  6 ++++
 drivers/gpu/drm/i915/intel_ringbuffer.c | 58 ++++++++++++++++++++++++++++++---
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 10ebd51..3d2a78b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1233,6 +1233,9 @@ struct drm_i915_file_private {
 #define HAS_OVERLAY(dev)		(INTEL_INFO(dev)->has_overlay)
 #define OVERLAY_NEEDS_PHYSICAL(dev)	(INTEL_INFO(dev)->overlay_needs_physical)
 
+/* Early gen2 have a totally busted CS tlb and require pinned batches. */
+#define HAS_BROKEN_CS_TLB(dev)		(IS_I830(dev) || IS_845G(dev))
+
 /* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte
  * rows, which changed the alignment requirements and fence programming.
  */
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 1d40f77..7d4ea0b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1119,6 +1119,12 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 	if (!ring->get_seqno)
 		return NULL;
 
+	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
+		obj = ring->private;
+
+		return i915_error_object_create(dev_priv, obj);
+	}
+
 	seqno = ring->get_seqno(ring, false);
 	list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list) {
 		if (obj->ring != ring)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index fdc6334..fb7c7b5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -547,9 +547,17 @@ static int init_render_ring(struct intel_ring_buffer *ring)
 
 static void render_ring_cleanup(struct intel_ring_buffer *ring)
 {
+	struct drm_device *dev = ring->dev;
+
 	if (!ring->private)
 		return;
 
+	if (HAS_BROKEN_CS_TLB(dev)) {
+		struct drm_i915_gem_object *obj = ring->private;
+
+		drm_gem_object_unreference(&obj->base);
+	}
+
 	cleanup_pipe_control(ring);
 }
 
@@ -985,20 +993,41 @@ i965_dispatch_execbuffer(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
+#define I830_BATCH_LIMIT (256*1024)
 static int
 i830_dispatch_execbuffer(struct intel_ring_buffer *ring,
 			 u32 offset, u32 len,
 			 unsigned flags)
 {
+	struct drm_i915_gem_object *obj = ring->private;
+	u32 cs_offset = obj->gtt_offset;
 	int ret;
 
-	ret = intel_ring_begin(ring, 4);
+	if (len > I830_BATCH_LIMIT)
+		return -ENOSPC;
+
+	ret = intel_ring_begin(ring, 8+4);
 	if (ret)
 		return ret;
 
+	/* Blit the batch (which has now all relocs applied) to the stable batch
+	 * scratch bo area (so that the CS never stumbles over its tlb
+	 * invalidation bug) ... */
+	intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD |
+			XY_SRC_COPY_BLT_WRITE_ALPHA |
+			XY_SRC_COPY_BLT_WRITE_RGB);
+	intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024);
+	intel_ring_emit(ring, cs_offset);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 4096);
+	intel_ring_emit(ring, offset);
+	/* ... and execute it. */
 	intel_ring_emit(ring, MI_BATCH_BUFFER);
-	intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE));
-	intel_ring_emit(ring, offset + len - 8);
+	intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE));
+	intel_ring_emit(ring, cs_offset + len - 8);
 	intel_ring_emit(ring, 0);
 	intel_ring_advance(ring);
 
@@ -1168,7 +1197,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 	 * of the buffer.
 	 */
 	ring->effective_size = ring->size;
-	if (IS_I830(ring->dev) || IS_845G(ring->dev))
+	if (HAS_BROKEN_CS_TLB(ring->dev))
 		ring->effective_size -= 128;
 
 	return 0;
@@ -1645,6 +1674,27 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 	ring->init = init_render_ring;
 	ring->cleanup = render_ring_cleanup;
 
+	/* Workaround batchbuffer to combat CS tlb bug. */
+	if (HAS_BROKEN_CS_TLB(dev)) {
+		struct drm_i915_gem_object *obj;
+		int ret;
+
+		obj = i915_gem_alloc_object(dev, I830_BATCH_LIMIT);
+		if (obj == NULL) {
+			DRM_ERROR("Failed to allocate batch bo\n");
+			return -ENOMEM;
+		}
+
+		ret = i915_gem_object_pin(obj, 4096, true, false);
+		if (ret != 0) {
+			drm_gem_object_unreference(&obj->base);
+			DRM_ERROR("Failed to pin batch bo\n");
+			return ret;
+		}
+
+		ring->private = obj;
+	}
+
 	return intel_init_ring_buffer(dev, ring);
 }
-- 
1.7.11.4
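A side note on the blit geometry above (again not part of the patch): the
copy is a 32bpp XY_SRC_COPY blit with a 4096-byte pitch and rows of 1024
pixels, i.e. 4096 bytes per row, with DIV_ROUND_UP(len, 4096) rows, so every
batch up to I830_BATCH_LIMIT is covered without overrunning the scratch bo.
A small standalone check of that arithmetic (the helper name is made up):

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define I830_BATCH_LIMIT (256 * 1024)

/* Bytes copied by the workaround blit for a batch of 'len' bytes:
 * DIV_ROUND_UP(len, 4096) rows of 1024 pixels at 4 bytes per pixel. */
static unsigned int wa_blit_bytes(unsigned int len)
{
	unsigned int rows = DIV_ROUND_UP(len, 4096);

	return rows * 1024 * 4;
}

int main(void)
{
	unsigned int len;

	for (len = 4; len <= I830_BATCH_LIMIT; len += 4) {
		unsigned int copied = wa_blit_bytes(len);

		/* Must cover the batch and stay inside the scratch bo. */
		assert(copied >= len);
		assert(copied <= I830_BATCH_LIMIT);
	}

	printf("blit covers every batch size up to %u bytes\n",
	       (unsigned int)I830_BATCH_LIMIT);
	return 0;
}

The 8+4 dwords requested from intel_ring_begin() line up with this as well:
8 dwords for the blit and 4 for the MI_BATCH_BUFFER dispatch.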