Added debugfs functions and embedded test infrastructure in the context event interrupt handler for simulating the loss of context event interrupts so that a context submission state inconsistency can be induced. This is useful for testing the consistency checker pre-stage to the engine hang recovery path since in order to test that the inconsistency detection works we first need to induce a state inconsistency that the inconsistency checker can detect and act upon. Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 88 +++++++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_dma.c | 2 + drivers/gpu/drm/i915/i915_drv.c | 3 ++ drivers/gpu/drm/i915/i915_drv.h | 12 +++++ drivers/gpu/drm/i915/intel_lrc.c | 68 ++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index edb79a7..233088e 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -4667,6 +4667,93 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, "%llu\n"); static int +i915_fake_ctx_submission_inconsistency_get(void *data, u64 *val) +{ + struct drm_device *dev = data; + struct drm_i915_private *dev_priv = dev->dev_private; + struct intel_engine_cs *ring; + unsigned i; + + DRM_INFO("Faked inconsistent context submission state: %x\n", + dev_priv->gpu_error.faked_lost_ctx_event_irq); + + for_each_ring(ring, dev_priv, i) { + u32 fake_cnt = + (dev_priv->gpu_error.faked_lost_ctx_event_irq >> (i<<2)) & 0xf; + + DRM_INFO("%s: Faking %s [%u IRQs left to drop]\n", + ring->name, + fake_cnt?"enabled":"disabled", + fake_cnt); + } + + *val = (u64) dev_priv->gpu_error.faked_lost_ctx_event_irq; + + return 0; +} + +static int +i915_fake_ctx_submission_inconsistency_set(void *data, u64 val) +{ + struct drm_device *dev = data; + struct drm_i915_private *dev_priv = dev->dev_private; + u32 fake_status; + + /* + * Set up a simulated/faked lost context event 
interrupt. This is used + * to induce inconsistent HW/driver states that the context submission + * status consistency checker (involved as a pre-stage to GPU engine + * hang recovery) can detect, which is required for validation purposes. + * + * val contains the new faked_lost_ctx_event_irq word that is to be + * OR-ed into the already set faked_lost_ctx_event_irq word. + * + * val == 0 means clear all previously set fake bits. + * + * Each nibble contains a number between 0-15 denoting the number of + * interrupts left to lose on the engine that nibble corresponds to. + * + * RCS: faked_lost_ctx_event_irq[3:0] + * VCS: faked_lost_ctx_event_irq[7:4] + * BCS: faked_lost_ctx_event_irq[11:8] + * VECS: faked_lost_ctx_event_irq[15:12] + * etc + * + * The number in each nibble is decremented by the context event + * interrupt handler in intel_lrc.c once the faked interrupt loss is + * executed. If a targeted interrupt is received while the nibble for + * that engine is non-zero, that interrupt will be dropped + * without side-effects, thus inducing an inconsistency since the + * hardware has entered a state where removal of a context from the + * context queue is required but the driver is not informed of this and + * is therefore stuck in that state until inconsistency rectification + * (forced CSB checking) or reboot. 
+ */ + + fake_status = + dev_priv->gpu_error.faked_lost_ctx_event_irq; + + DRM_INFO("Faking lost context event IRQ (new status: %x, old status: %x)\n", + (u32) val, fake_status); + + if (val) { + dev_priv->gpu_error.faked_lost_ctx_event_irq |= ((u32) val); + } else { + DRM_INFO("Clearing lost context event IRQ mask\n"); + + dev_priv->gpu_error.faked_lost_ctx_event_irq = 0; + } + + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_fake_ctx_submission_inconsistency_fops, + i915_fake_ctx_submission_inconsistency_get, + i915_fake_ctx_submission_inconsistency_set, + "%llu\n"); + +static int i915_ring_stop_get(void *data, u64 *val) { struct drm_device *dev = data; @@ -5320,6 +5407,7 @@ static const struct i915_debugfs_files { const struct file_operations *fops; } i915_debugfs_files[] = { {"i915_wedged", &i915_wedged_fops}, + {"i915_fake_ctx_inconsistency", &i915_fake_ctx_submission_inconsistency_fops}, {"i915_max_freq", &i915_max_freq_fops}, {"i915_min_freq", &i915_min_freq_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 1e203e7d..e8193c9 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -843,6 +843,8 @@ i915_hangcheck_init(struct drm_device *dev) int i; struct drm_i915_private *dev_priv = dev->dev_private; + dev_priv->gpu_error.faked_lost_ctx_event_irq = 0; + for (i = 0; i < I915_NUM_RINGS; i++) { struct intel_engine_cs *engine = &dev_priv->ring[i]; struct intel_ring_hangcheck *hc = &engine->hangcheck; diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c361b19..c32c475 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -917,6 +917,9 @@ int i915_reset(struct drm_device *dev) } } + /* Clear simulated lost context event interrupts */ + dev_priv->gpu_error.faked_lost_ctx_event_irq = 0; + if (i915_stop_ring_allow_warn(dev_priv)) pr_notice("drm/i915: Resetting chip after gpu 
hang\n"); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9219904..7ebf800 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1400,6 +1400,18 @@ struct i915_gpu_error { #define I915_STOP_RING_ALLOW_BAN (1 << 31) #define I915_STOP_RING_ALLOW_WARN (1 << 30) + /* + * Bit mask for simulation of lost context event IRQs on each + * respective engine. + * + * Bits 0:3: Number of lost IRQs to be faked on RCS + * Bits 4:7: Number of lost IRQs to be faked on VCS + * Bits 8:11: Number of lost IRQs to be faked on BCS + * Bits 12:15: Number of lost IRQs to be faked on VECS + * Bits 16:19: Number of lost IRQs to be faked on VCS2 + */ + u32 faked_lost_ctx_event_irq; + /* For missed irq/seqno simulation. */ unsigned int test_irq_rings; diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index b48f74c..5bb7d6e 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -700,6 +700,52 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring, } /** + * fake_lost_ctx_event_irq() - Checks for pending faked lost context event IRQs. + * @dev_priv: i915 device private data holding the faked lost IRQ mask. + * @ring: Engine to check pending faked lost IRQs for. + * + * Checks the bits in dev_priv->gpu_error.faked_lost_ctx_event_irq corresponding + * to the specified engine and updates the bits and returns a value accordingly. + * + * Return: + * true: If the current IRQ is to be lost. + * false: If the current IRQ is to be processed as normal. + */ +static inline bool fake_lost_ctx_event_irq(struct drm_i915_private *dev_priv, + struct intel_engine_cs *ring) +{ + u32 *faked_lost_irq_mask = + &dev_priv->gpu_error.faked_lost_ctx_event_irq; + + /* + * Point out the least significant bit in the nibble of the faked lost + * context event IRQ mask that corresponds to the engine at hand. 
+ */ + u32 engine_nibble = (ring->id << 2); + + /* Check engine nibble for any pending IRQs to be simulated as lost */ + if (*faked_lost_irq_mask & (0xf << engine_nibble)) { + DRM_INFO("Faked lost interrupt on %s! (%x)\n", + ring->name, + *faked_lost_irq_mask); + + /* + * Subtract the IRQ that is to be simulated as lost from the + * engine nibble. + */ + *faked_lost_irq_mask -= (0x1 << engine_nibble); + + DRM_INFO("New fake lost irq mask: %x\n", + *faked_lost_irq_mask); + + /* Tell the IRQ handler to simulate lost context event IRQ */ + return true; + } + + return false; +} + +/** * intel_lrc_irq_handler() - handle Context Switch interrupts * @ring: Engine Command Streamer to handle. * @do_lock: Lock execlist spinlock (if false the caller is responsible for this) @@ -740,6 +786,23 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock) if (status & GEN8_CTX_STATUS_PREEMPTED) { if (status & GEN8_CTX_STATUS_LITE_RESTORE) { + if (fake_lost_ctx_event_irq(dev_priv, ring)) { + /* + * If we want to simulate the loss of a + * context event IRQ (only for such events + * that could affect the execlist queue, + * since this is something that could + * affect the context submission status + * consistency checker) then just exit the + * IRQ handler early with no side-effects! + * We want to pretend like this IRQ never + * happened. The next time the IRQ handler + * is entered for this engine the CSB + * events should remain in the CSB, waiting + * to be processed. 
+ */ + goto exit; + } if (execlists_check_remove_request(ring, status_id)) WARN(1, "Lite Restored request removed from queue\n"); } else @@ -748,6 +811,10 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock) if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) || (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) { + + if (fake_lost_ctx_event_irq(dev_priv, ring)) + goto exit; + if (execlists_check_remove_request(ring, status_id)) submit_contexts++; } @@ -770,6 +837,7 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock) ((u32)ring->next_context_status_buffer & GEN8_CSB_PTR_MASK) << 8)); +exit: if (do_lock) spin_unlock(&ring->execlist_lock); -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx