Re: [PATCH 2/2] drm/i915/bxt: work around HW context corruption due to coherency problem

Imre Deak <imre.deak@xxxxxxxxx> · Thu, 17 Sep 2015 19:16:34 +0300

On ti, 2015-09-15 at 21:30 +0300, Imre Deak wrote:
> The execlist context object is mapped with a CPU/GPU coherent mapping
> everywhere, but on BXT A stepping due to a HW issue the coherency is not
> guaranteed. To work around this flush the CPU cache after any change
> from the CPU to the context object. Note that this also includes any
> changes done by the VM core as opposed to the driver, when
> reading from backing store/bzeroing the pages.
> 
> I noticed this problem via a GPU hang, where IPEHR pointed to an invalid
> opcode value. I couldn't find this value on the ring but looking at the
> contents of the active context object it turned out to be a parameter
> dword of a bigger command there. The original command opcode itself
> was zeroed out, based on the above I assume due to a CPU writeback of
> the corresponding cacheline. When restoring the context the GPU would
> jump over the zeroed out opcode and hang when trying to execute the
> above parameter dword.
> 
> I could easily reproduce this by running igt/gem_render_copy_redux and
> gem_tiled_blits/basic in parallel, but I guess it could be triggered by
> anything involving frequent switches between two separate contexts. With
> this workaround I couldn't reproduce the problem.
> 
> Note that I also considered using set_pages_array_uc/wc on the context
> object but this wouldn't work with kmap_atomic which always returns a WB
> mapping, at least on HIGHMEM. The alternative would be keeping a UC/WC
> kernel mapping around whenever the context object is pinned, but this
> would be a bigger change. Since I'm not sure if there would be any
> benefit in using set_pages_array, I chose the simpler clflush method.

Ville noticed a race condition with the above approach when updating the
context during context switch: it's possible that the GPU writes the
updated ring head value at the same time the driver updates the tail
value. In this case the driver could corrupt the updated head value when
doing the clflush, since head and tail are in the same cacheline. So
instead we can map the corresponding page uncached avoiding this
scenario. I'll follow up with a v2 patchset doing this.

> Signed-off-by: Imre Deak <imre.deak@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_drv.h  |  2 ++
>  drivers/gpu/drm/i915/i915_gem.c  |  2 +-
>  drivers/gpu/drm/i915/intel_lrc.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4ea3e7b..ca45269 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3103,6 +3103,8 @@ i915_gem_object_ggtt_unpin(struct drm_i915_gem_object *obj)
>  	i915_gem_object_ggtt_unpin_view(obj, &i915_ggtt_view_normal);
>  }
>  
> +bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj);
> +
>  /* i915_gem_fence.c */
>  int __must_check i915_gem_object_get_fence(struct drm_i915_gem_object *obj);
>  int __must_check i915_gem_object_put_fence(struct drm_i915_gem_object *obj);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index cb0df7e..4543124 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -53,7 +53,7 @@ static bool cpu_cache_is_coherent(struct drm_device *dev,
>  	return HAS_LLC(dev) || level != I915_CACHE_NONE;
>  }
>  
> -static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> +bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>  {
>  	if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
>  		return true;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 3f18ea1..de866be 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -362,6 +362,8 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
>  	struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj;
>  	struct page *page;
>  	uint32_t *reg_state;
> +	void *clflush_start;
> +	void *clflush_end;
>  
>  	BUG_ON(!ctx_obj);
>  	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
> @@ -373,6 +375,9 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
>  	reg_state[CTX_RING_TAIL+1] = rq->tail;
>  	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
>  
> +	clflush_start = &reg_state[CTX_RING_TAIL + 1];
> +	clflush_end = &reg_state[CTX_RING_BUFFER_START + 2];
> +
>  	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
>  		/* True 32b PPGTT with dynamic page allocation: update PDP
>  		 * registers and point the unallocated PDPs to scratch page.
> @@ -383,8 +388,14 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
>  		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
>  		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
>  		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> +
> +		clflush_end = &reg_state[CTX_PDP0_LDW + 2];
>  	}
>  
> +	if (cpu_write_needs_clflush(ctx_obj))
> +		drm_clflush_virt_range(clflush_start,
> +				       clflush_end - clflush_start);
> +
>  	kunmap_atomic(reg_state);
>  
>  	return 0;
> @@ -1036,6 +1047,13 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
>  	if (ret)
>  		return ret;
>  
> +	/*
> +	 * For a non-coherent mapping we need to flush any related CPU cache
> +	 * lines, otherwise the writeback of these would corrupt the data
> +	 * written by the GPU.
> +	 */
> +	i915_gem_clflush_object(ctx_obj, false);
> +
>  	ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
>  	if (ret)
>  		goto unpin_ctx_obj;

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx