On 04/07/17 09:09, Chris Wilson wrote:
> Triggering a GPU reset for one engine affects another, notably
> corrupting the context status buffer (CSB) effectively losing
track of
> inflight requests.
>
> Adding a few printks:
> diff --git a/drivers/gpu/drm/i915/i915_drv.c
b/drivers/gpu/drm/i915/i915_drv.c
> index ad41836fa5e5..a969456bc0fa 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1953,6 +1953,7 @@ int i915_reset_engine(struct
intel_engine_cs *engine)
> goto out;
> }
>
> + pr_err("Resetting %s\n", engine->name);
> ret = intel_gpu_reset(engine->i915,
intel_engine_flag(engine));
> if (ret) {
> /* If we fail here, we expect to fallback to a
global reset */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c
b/drivers/gpu/drm/i915/intel_lrc.c
> index 716e5c9ea222..a72bc35d0870 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -355,6 +355,7 @@ static void execlists_submit_ports(struct
intel_engine_cs *engine)
>
execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> port_set(&port[n], port_pack(rq, count));
> desc = execlists_update_context(rq);
> + pr_err("%s: in (rq=%x) ctx=%d\n",
engine->name, rq->global_seqno, upper_32_bits(desc));
> GEM_DEBUG_EXEC(port[n].context_id =
upper_32_bits(desc));
> } else {
> GEM_BUG_ON(!n);
> @@ -594,9 +595,23 @@ static void intel_lrc_irq_handler(unsigned
long data)
> if (!(status &
GEN8_CTX_STATUS_COMPLETED_MASK))
> continue;
>
> + pr_err("%s: out CSB (%x head=%d,
tail=%d), ctx=%d, rq=%d\n",
> + engine->name,
> + readl(csb_mmio),
> + head, tail,
> + readl(buf+2*head+1),
> + port->context_id);
> +
> /* Check the context/desc id for this
event matches */
> - GEM_DEBUG_BUG_ON(readl(buf + 2 * head +
1) !=
> - port->context_id);
> + if (readl(buf + 2 * head + 1) !=
port->context_id) {
> + pr_err("%s: BUG CSB (%x head=%d,
tail=%d), ctx=%d, rq=%d\n",
> + engine->name,
> + readl(csb_mmio),
> + head, tail,
> +
readl(buf+2*head+1),
> + port->context_id);
> + BUG();
> + }
>
> rq = port_unpack(port, &count);
> GEM_BUG_ON(count == 0);
>
> Results in:
>
> [ 6423.006602] Resetting rcs0
> [ 6423.009080] rcs0: in (rq=fffffe70) ctx=1
> [ 6423.009216] rcs0: in (rq=fffffe6f) ctx=3
> [ 6423.009542] rcs0: out CSB (2 head=1, tail=2), ctx=3, rq=3
> [ 6423.009619] Resetting bcs0
> [ 6423.009980] rcs0: BUG CSB (0 head=1, tail=2), ctx=0, rq=3
>
It took me a while, but I was able to replicate this (with your
igt_reset_active_engines) a couple of times. I also captured the
value
of the CSB events at that point and it looked like this.