On IVB hardware we are given an interrupt whenever a L3 parity error occurs in the L3 cache. The L3 cache is used by internal GPU clients only. This is a very rare occurrence (in fact to test this I need to use specially instrumented silicon). When a row in the L3 cache detects a parity error the HW generates an interrupt. The interrupt is masked in GTIMR until we get a chance to read some registers and alert userspace via a uevent. With this information userspace can use a sysfs interface (follow-up patch) to remap those rows. Way above my level of understanding, but if a given row fails, it is statistically more likely to fail again than a row which has not failed. Therefore it is desirable for an operating system to maintain a lifelong list of failing rows and always remap any bad rows on driver load. Hardware limits the number of rows that are remappable per bank/subbank, and should more than that many rows detect parity errors, software should maintain a list of the most frequent errors, and remap those rows. Signed-off-by: Ben Widawsky <ben at bwidawsk.net> --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_irq.c | 83 +++++++++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_reg.h | 17 ++++++++ 3 files changed, 102 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 69e1539..9505fc0 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -804,6 +804,8 @@ typedef struct drm_i915_private { struct drm_property *broadcast_rgb_property; struct drm_property *force_audio_property; + + struct work_struct parity_error_work; } drm_i915_private_t; enum hdmi_force_audio { diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index ab023ca..81e5a7d 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work) mutex_unlock(&dev_priv->dev->struct_mutex); } + +/** + * ivybridge_parity_work - Workqueue called when a parity error interrupt + * occurred. + * + * Doesn't actually do anything except notify userspace so that userspace may + * disable things later on. + */ +static void ivybridge_parity_work(struct work_struct *work) +{ + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, + parity_error_work); + + u32 error_status, row, bank, subbank; + char *parity_event[5]; + uint32_t misccpctl; + unsigned long flags; + + /* We must turn off DOP level clock gating to access the L3 registers. + * In order to prevent a get/put style interface, acquire struct mutex + * any time we access those registers. + */ + mutex_lock(&dev_priv->dev->struct_mutex); + + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); + POSTING_READ(GEN7_MISCCPCTL); + + error_status = I915_READ(GEN7_L3CDERRST1); + row = GEN7_PARITY_ERROR_ROW(error_status); + bank = GEN7_PARITY_ERROR_BANK(error_status); + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); + + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | + GEN7_L3CDERRST1_ENABLE); + POSTING_READ(GEN7_L3CDERRST1); + + I915_WRITE(GEN7_MISCCPCTL, misccpctl); + + spin_lock_irqsave(&dev_priv->irq_lock, flags); + dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv->gt_irq_mask); + spin_unlock_irqrestore(&dev_priv->irq_lock, flags); + + mutex_unlock(&dev_priv->dev->struct_mutex); + + parity_event[0] = "L3_PARITY_ERROR=1"; + parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row); + parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank); + parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank); + parity_event[4] = NULL; + + kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj, + KOBJ_CHANGE, parity_event); + + kfree(parity_event[3]); + kfree(parity_event[2]); + kfree(parity_event[1]); +} + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; + unsigned long flags; + + if (WARN_ON(IS_GEN6(dev))) + return; + + spin_lock_irqsave(&dev_priv->irq_lock, flags); + dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv->gt_irq_mask); + spin_unlock_irqrestore(&dev_priv->irq_lock, flags); + + queue_work(dev_priv->wq, &dev_priv->parity_error_work); + DRM_INFO("Parity error interrupt. Scheduling work\n"); +} + static void snb_gt_irq_handler(struct drm_device *dev, struct drm_i915_private *dev_priv, u32 gt_iir) @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev, DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir); i915_handle_error(dev, false); } + + if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT) + ivybridge_handle_parity_error(dev); } static void gen6_queue_rps_work(struct drm_i915_private *dev_priv, @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev) if (IS_GEN6(dev) || IS_IVYBRIDGE(dev)) INIT_WORK(&dev_priv->rps_work, gen6_pm_rps_work); + if (IS_IVYBRIDGE(dev)) + INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work); + I915_WRITE(HWSTAM, 0xeffe); /* XXX hotplug from PCH */ diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 5ac9837..72db6a9 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -4030,6 +4030,23 @@ #define GEN6_RC6 3 #define GEN6_RC7 4 +#define GEN7_MISCCPCTL (0x9424) +#define GEN7_DOP_CLOCK_GATE_ENABLE (1<<0) + +/* IVYBRIDGE DPF */ +#define GEN7_L3CDERRST1 0xB008 /* L3CD Error Status 1 */ +#define GEN7_L3CDERRST1_ROW_MASK (0x7ff<<14) +#define GEN7_PARITY_ERROR_VALID (1<<13) +#define GEN7_L3CDERRST1_BANK_MASK (3<<11) +#define GEN7_L3CDERRST1_SUBBANK_MASK (7<<8) +#define GEN7_PARITY_ERROR_ROW(reg) \ + ((reg & GEN7_L3CDERRST1_ROW_MASK) >> 14) +#define GEN7_PARITY_ERROR_BANK(reg) \ + ((reg & GEN7_L3CDERRST1_BANK_MASK) >> 11) +#define GEN7_PARITY_ERROR_SUBBANK(reg) \ + ((reg & GEN7_L3CDERRST1_SUBBANK_MASK) >> 8) +#define GEN7_L3CDERRST1_ENABLE (1<<7) + #define G4X_AUD_VID_DID 0x62020 #define INTEL_AUDIO_DEVCL 0x808629FB #define INTEL_AUDIO_DEVBLC 0x80862801 -- 1.7.10