On Fri, Apr 27, 2012 at 05:40:18PM -0700, Ben Widawsky wrote: > On IVB hardware we are given an interrupt whenever a L3 parity error > occurs in the L3 cache. The L3 cache is used by internal GPU clients > only. This is a very rare occurrence (in fact to test this I need to > use specially instrumented silicon). > > When a row in the L3 cache detects a parity error the HW generates an > interrupt. The interrupt is masked in GTIMR until we get a chance to > read some registers and alert userspace via a uevent. With this > information userspace can use a sysfs interface (follow-up patch) to > remap those rows. > > Way above my level of understanding, but if a given row fails, it is > statistically more likely to fail again than a row which has not failed. > Therefore it is desirable for an operating system to maintain a lifelong > list of failing rows and always remap any bad rows on driver load. > Hardware limits the number of rows that are remappable per bank/subbank, > and should more than that many rows detect parity errors, software > should maintain a list of the most frequent errors, and remap those > rows. 
> > Signed-off-by: Ben Widawsky <ben at bwidawsk.net> > --- > drivers/gpu/drm/i915/i915_drv.h | 2 + > drivers/gpu/drm/i915/i915_irq.c | 83 +++++++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/i915_reg.h | 17 ++++++++ > 3 files changed, 102 insertions(+) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 69e1539..9505fc0 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -804,6 +804,8 @@ typedef struct drm_i915_private { > > struct drm_property *broadcast_rgb_property; > struct drm_property *force_audio_property; > + > + struct work_struct parity_error_work; > } drm_i915_private_t; > > enum hdmi_force_audio { > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index ab023ca..81e5a7d 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work) > mutex_unlock(&dev_priv->dev->struct_mutex); > } > > + > +/** > + * ivybridge_parity_work - Workqueue called when a parity error interrupt > + * occurred. > + * > + * Doesn't actually do anything except notify userspace so that userspace may > + * disable things later on. > + */ > +static void ivybridge_parity_work(struct work_struct *work) > +{ > + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, > + parity_error_work); > + > + u32 error_status, row, bank, subbank; > + char *parity_event[5]; > + uint32_t misccpctl; > + unsigned long flags; > + > + /* We must turn off DOP level clock gating to access the L3 registers. > + * In order to prevent a get/put style interface, acquire struct mutex > + * any time we access those registers. 
> + */ > + mutex_lock(&dev_priv->dev->struct_mutex); > + > + misccpctl = I915_READ(GEN7_MISCCPCTL); > + I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); > + POSTING_READ(GEN7_MISCCPCTL); > + > + error_status = I915_READ(GEN7_L3CDERRST1); > + row = GEN7_PARITY_ERROR_ROW(error_status); > + bank = GEN7_PARITY_ERROR_BANK(error_status); > + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); > + > + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | > + GEN7_L3CDERRST1_ENABLE); > + POSTING_READ(GEN7_L3CDERRST1); > + > + I915_WRITE(GEN7_MISCCPCTL, misccpctl); > + > + spin_lock_irqsave(&dev_priv->irq_lock, flags); > + dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; > + I915_WRITE(GTIMR, dev_priv->gt_irq_mask); > + spin_unlock_irqrestore(&dev_priv->irq_lock, flags); > + > + mutex_unlock(&dev_priv->dev->struct_mutex); > + > + parity_event[0] = "L3_PARITY_ERROR=1"; > + parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row); > + parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank); > + parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank); > + parity_event[4] = NULL; > + > + kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj, > + KOBJ_CHANGE, parity_event); > + > + kfree(parity_event[3]); > + kfree(parity_event[2]); > + kfree(parity_event[1]); > +} > + > +void ivybridge_handle_parity_error(struct drm_device *dev) > +{ > + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; > + unsigned long flags; > + > + if (WARN_ON(IS_GEN6(dev))) > + return; > + > + spin_lock_irqsave(&dev_priv->irq_lock, flags); > + dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; > + I915_WRITE(GTIMR, dev_priv->gt_irq_mask); > + spin_unlock_irqrestore(&dev_priv->irq_lock, flags); > + > + queue_work(dev_priv->wq, &dev_priv->parity_error_work); > + DRM_INFO("Parity error interrupt. 
Scheduling work\n"); > +} > + > static void snb_gt_irq_handler(struct drm_device *dev, > struct drm_i915_private *dev_priv, > u32 gt_iir) > @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev, > DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir); > i915_handle_error(dev, false); > } > + > + if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT) > + ivybridge_handle_parity_error(dev); > } > > static void gen6_queue_rps_work(struct drm_i915_private *dev_priv, > @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev) > if (IS_GEN6(dev) || IS_IVYBRIDGE(dev)) > INIT_WORK(&dev_priv->rps_work, gen6_pm_rps_work); > > + if (IS_IVYBRIDGE(dev)) > + INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work); > + work init has moved to intel_irq_init in dinq, and for good reasons as I've figured out after merging the patch: _preinstall is also called on resume, and if we're unlucky we have a work item outstanding from before the suspend, so that we re-init a live work item. The core work queue code doesn't approve of that, resulting in decent hilarity (NULL deref after suspend). 
-Daniel > I915_WRITE(HWSTAM, 0xeffe); > > /* XXX hotplug from PCH */ > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index 5ac9837..72db6a9 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -4030,6 +4030,23 @@ > #define GEN6_RC6 3 > #define GEN6_RC7 4 > > +#define GEN7_MISCCPCTL (0x9424) > +#define GEN7_DOP_CLOCK_GATE_ENABLE (1<<0) > + > +/* IVYBRIDGE DPF */ > +#define GEN7_L3CDERRST1 0xB008 /* L3CD Error Status 1 */ > +#define GEN7_L3CDERRST1_ROW_MASK (0x7ff<<14) > +#define GEN7_PARITY_ERROR_VALID (1<<13) > +#define GEN7_L3CDERRST1_BANK_MASK (3<<11) > +#define GEN7_L3CDERRST1_SUBBANK_MASK (7<<8) > +#define GEN7_PARITY_ERROR_ROW(reg) \ > + ((reg & GEN7_L3CDERRST1_ROW_MASK) >> 14) > +#define GEN7_PARITY_ERROR_BANK(reg) \ > + ((reg & GEN7_L3CDERRST1_BANK_MASK) >> 11) > +#define GEN7_PARITY_ERROR_SUBBANK(reg) \ > + ((reg & GEN7_L3CDERRST1_SUBBANK_MASK) >> 8) > +#define GEN7_L3CDERRST1_ENABLE (1<<7) > + > #define G4X_AUD_VID_DID 0x62020 > #define INTEL_AUDIO_DEVCL 0x808629FB > #define INTEL_AUDIO_DEVBLC 0x80862801 > -- > 1.7.10 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx at lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Daniel Vetter Mail: daniel at ffwll.ch Mobile: +41 (0)79 365 57 48