>From 0f1092b19d66cfe803b22f9e982c5e5be5a5ff23 Mon Sep 17 00:00:00 2001 Message-Id: <0f1092b19d66cfe803b22f9e982c5e5be5a5ff23.1387201899.git.ian.lister@xxxxxxxxx> In-Reply-To: <cover.1387201899.git.ian.lister@xxxxxxxxx> References: <cover.1387201899.git.ian.lister@xxxxxxxxx> From: ian-lister <ian.lister@xxxxxxxxx> Date: Tue, 10 Dec 2013 11:53:25 +0000 Subject: [RFC 06/13] drm/i915: Communicating reset requests Per-engine recovery requires us to pass the hung ring information through to i915_error_work_func. The reset_counter was sufficient on its own when we were just dealing with global reset, but for per-engine reset we need to signal reset requests for each engine. The reset_counter offers a very tidy way of communicating to the rest of the driver that a reset is in-progress or that a reset has happened since the count was last checked, without using locks. Ideally we want to keep this mechanism. To resolve this, the reset_counter is now shared by global reset and per-engine reset. It is an indication that some sort of recovery work is in-progress. A new variable "reset_requests" has been added to pass the specific reset details from i915_handle_error to i915_error_work_func. Both of these functions take gpu_error.lock whilst modifying these variables, which guarantees that the I915_RESET_IN_PROGRESS flag remains set for as long as gpu_error.reset_requests is non-zero. This retains the best of the original scheme as the rest of the driver can remain lockless when checking the reset_counter. This patch is part of a set and i915_error_work_func doesn't yet attempt to do per-engine reset recovery (it will convert all requests into global resets). 
Signed-off-by: ian-lister <ian.lister@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 57 +++++++++++ drivers/gpu/drm/i915/i915_drv.h | 11 +++ drivers/gpu/drm/i915/i915_gem.c | 3 +- drivers/gpu/drm/i915/i915_irq.c | 166 +++++++++++++++++++++++--------- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 + drivers/gpu/drm/i915/intel_uncore.c | 22 ++++- 6 files changed, 210 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 311c647..f00b19c 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -896,6 +896,62 @@ static const struct file_operations i915_error_state_fops = { .release = i915_error_state_release, }; +static ssize_t +i915_ring_hangcheck_read(struct file *filp, + char __user *ubuf, + size_t max, + loff_t *ppos) +{ + /* Returns the total number of times the rings + * have hung and been reset since boot */ + struct drm_device *dev = filp->private_data; + drm_i915_private_t *dev_priv = dev->dev_private; + char buf[100]; + int len; + + len = scnprintf(buf, sizeof(buf), + "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X\n", + dev_priv->gpu_error.global_resets, + dev_priv->ring[RCS].hangcheck.total, + dev_priv->ring[VCS].hangcheck.total, + dev_priv->ring[BCS].hangcheck.total); + + return simple_read_from_buffer(ubuf, max, ppos, buf, len); +} + +static ssize_t +i915_ring_hangcheck_write(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + struct drm_device *dev = filp->private_data; + struct drm_i915_private *dev_priv = dev->dev_private; + uint32_t i; + unsigned long irqflags; + + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + + for (i = 0; i < I915_NUM_RINGS; i++) { + /* Reset the hangcheck counters */ + dev_priv->ring[i].hangcheck.total = 0; + } + + dev_priv->gpu_error.global_resets = 0; + + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); + + return cnt; +} + +static const struct file_operations 
i915_ring_hangcheck_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = i915_ring_hangcheck_read, + .write = i915_ring_hangcheck_write, + .llseek = default_llseek, +}; + static int i915_next_seqno_get(void *data, u64 *val) { @@ -3139,6 +3195,7 @@ static const struct i915_debugfs_files { {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, {"i915_ring_test_irq", &i915_ring_test_irq_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, + {"i915_ring_hangcheck", &i915_ring_hangcheck_fops}, {"i915_error_state", &i915_error_state_fops}, {"i915_next_seqno", &i915_next_seqno_fops}, {"i915_display_crc_ctl", &i915_display_crc_ctl_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0d8805e..1087e4e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1132,6 +1132,13 @@ struct i915_gpu_error { */ atomic_t reset_counter; + /* Tracking of per-engine reset requests. + * It must be synchronised with reset_counter by taking gpu_error.lock + * to ensure that the I915_RESET_IN_PROGRESS flag is always set if + * reset_requests is non-zero.*/ +#define I915_GLOBAL_RESET_REQUEST (1 << 31) + u32 reset_requests; + /** * Special values/flags for reset_counter * @@ -1146,6 +1153,9 @@ struct i915_gpu_error { * to global reset in case per-engine reset happens too frequently */ u32 score; + /* A count of global resets for debugfs stats */ + u32 global_resets; + /** * Waitqueue to signal when the reset has completed. Used by clients * that wait for dev_priv->mm.wedged to settle. 
@@ -1936,6 +1946,7 @@ extern int intel_gpu_reset(struct drm_device *dev); extern int intel_gpu_engine_reset(struct drm_device *dev, enum intel_ring_id engine); extern int i915_reset(struct drm_device *dev); +extern int i915_handle_hung_ring(struct intel_ring_buffer *ring); extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv); extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv); extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a29b355..42a1647 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2351,7 +2351,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, struct drm_i915_gem_request, list); - if (request->seqno > completed_seqno) + if (!ring->hangcheck.status_updated + && (request->seqno > completed_seqno)) i915_set_reset_status(ring, request, acthd); i915_gem_free_request(request); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 0131423..f550b1e 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2206,63 +2206,117 @@ static void i915_error_work_func(struct work_struct *work) drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t, gpu_error); struct drm_device *dev = dev_priv->dev; + struct intel_ring_buffer *ring; char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; int ret; - - kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event); - + u32 reset_requests; + unsigned long irqflags; + int i; + /* * Note that there's only one work item which does gpu resets, so we * need not worry about concurrent gpu resets potentially incrementing * error->reset_counter twice. 
We only need to take care of another - * racing irq/hangcheck declaring the gpu dead for a second time. A - * quick check for that is good enough: schedule_work ensures the - * correct ordering between hang detection and this work item, and since - * the reset in-progress bit is only ever set by code outside of this - * work we don't need to worry about any other races. + * racing irq/hangcheck declaring another ring hung. + * The spinlock ensures synchronisation with i915_handle_error and + * makes sure that the reset_counter doesn't get incremented until *all* + * pending reset requests have been processed. */ - if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) { - DRM_DEBUG_DRIVER("resetting chip\n"); - kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, - reset_event); + spin_lock_irqsave(&error->lock, irqflags); + reset_requests = error->reset_requests; + error->reset_requests = 0; + spin_unlock_irqrestore(&error->lock, irqflags); - /* - * All state reset _must_ be completed before we update the - * reset counter, for otherwise waiters might miss the reset - * pending state and not properly drop locks, resulting in - * deadlocks with the reset work. - */ - ret = i915_reset(dev); + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event); - intel_display_handle_reset(dev); + if (reset_requests && !i915_terminally_wedged(error)) { + DRM_DEBUG_TDR("Processing reset requests: 0x%08x\n", + reset_requests); - if (ret == 0) { - /* - * After all the gem state is reset, increment the reset - * counter and wake up everyone waiting for the reset to - * complete. - * - * Since unlock operations are a one-sided barrier only, - * we need to insert a barrier here to order any seqno - * updates before - * the counter increment. 
- */ - smp_mb__before_atomic_inc(); - atomic_inc(&dev_priv->gpu_error.reset_counter); + /* Grab mutex whilst processing individual reset requests*/ + mutex_lock(&dev->struct_mutex); - kobject_uevent_env(&dev->primary->kdev->kobj, - KOBJ_CHANGE, reset_done_event); - } else { - atomic_set(&error->reset_counter, I915_WEDGED); + for_each_ring(ring, dev_priv, i) { + /* Reset the status update flag. It ensures that + * the status is only updated once for each ring per + * call to error_work_func even if we fall back to + * a global reset, + */ + ring->hangcheck.status_updated = 0; + + /* Global reset takes precedence */ + if (reset_requests & I915_GLOBAL_RESET_REQUEST) + continue; + + if (reset_requests & (0x1 << ring->id)) { + DRM_DEBUG_TDR("reset request for %s\n", + ring->name); + + if (i915_handle_hung_ring(ring) != 0) { + DRM_ERROR("%s reset failed\n", + ring->name); + + /* Try a global reset instead */ + reset_requests = + I915_GLOBAL_RESET_REQUEST; + continue; + } + } } - /* - * Note: The wake_up also serves as a memory barrier so that - * waiters see the update value of the reset counter atomic_t. + mutex_unlock(&dev->struct_mutex); + + if (reset_requests & I915_GLOBAL_RESET_REQUEST) { + DRM_DEBUG_DRIVER("resetting chip\n"); + kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, + reset_event); + + /* + * All state reset _must_ be completed before we update the + * reset counter, for otherwise waiters might miss the reset + * pending state and not properly drop locks, resulting in + * deadlocks with the reset work. + * After all the gem state is reset, increment the reset + * counter and wake up everyone waiting for the reset to + * complete. + */ + ret = i915_reset(dev); + + intel_display_handle_reset(dev); + + if (ret != 0) + atomic_set(&error->reset_counter, I915_WEDGED); + } + + /* Only increment reset counter if no new work has been added + * since we sampled reset_requests. 
If more work has been + * added then i915_error_work_func will be scheduled again so + * leave the in-progress flag set. */ - i915_error_wake_up(dev_priv, true); + if (!i915_terminally_wedged(error)) { + spin_lock_irqsave(&error->lock, irqflags); + if (error->reset_requests == 0) { + /* Since unlock operations are a one-sided barrier only, + * we need to insert a barrier here to order any seqno + * updates before the counter increment. + */ + smp_mb__before_atomic_inc(); + atomic_inc(&error->reset_counter); + } + spin_unlock_irqrestore(&error->lock, irqflags); + + /* + * Note: The wake_up also serves as a memory barrier so that + * waiters see the update value of the reset counter atomic_t. + */ + i915_error_wake_up(dev_priv, true); + } + + /* Send uevent to indicate that we have finished recovery work*/ + kobject_uevent_env(&dev->primary->kdev->kobj, + KOBJ_CHANGE, reset_done_event); } } @@ -2379,13 +2433,25 @@ static void i915_report_and_clear_eir(struct drm_device *dev) void i915_handle_error(struct drm_device *dev, u32 mask) { struct drm_i915_private *dev_priv = dev->dev_private; + unsigned long irqflags; i915_capture_error_state(dev); i915_report_and_clear_eir(dev); - if (mask > 0) { + if (mask) { + /* Once the error work function executes it will sample the + * current value of reset_requests and start processing them. + * If we detect another hang before it completes then we need + * to ensure that it doesn't clear the IN_PROGRESS flag + * prematurely as the rest of the driver relies on reset_counter + * alone. A spinlock is used to synchronise between reset_counter + * and reset_requests. 
+ */ + spin_lock_irqsave(&dev_priv->gpu_error.lock, irqflags); + dev_priv->gpu_error.reset_requests = mask; atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG, - &dev_priv->gpu_error.reset_counter); + &dev_priv->gpu_error.reset_counter); + spin_unlock_irqrestore(&dev_priv->gpu_error.lock, irqflags); /* * Wakeup waiting processes so that the reset work function @@ -2405,9 +2471,13 @@ void i915_handle_error(struct drm_device *dev, u32 mask) /* * Our reset work can grab modeset locks (since it needs to reset the - * state of outstanding pagelips). Hence it must not be run on our own - * dev-priv->wq work queue for otherwise the flush_work in the pageflip - * code will deadlock. + * state of outstanding pageflips). Hence it must not be run on our own + * dev_priv->wq work queue for otherwise the flush_work in the pageflip + * code will deadlock. If error_work is already in the work queue then + * it will not be added again. If it isn't in the queue or it is + * currently executing then this call will add it to the queue again so + * that even if it misses the reset flags during the current call it + * is guaranteed to see them on the next call. */ schedule_work(&dev_priv->gpu_error.work); } @@ -2921,12 +2991,12 @@ static void i915_hangcheck_elapsed(unsigned long data) if (INTEL_INFO(dev)->gen < 7) { /* Per-engine reset not supported */ dev_priv->gpu_error.score = 0; - hang_mask = GLOBAL_RESET_REQUEST; + hang_mask = I915_GLOBAL_RESET_REQUEST; } else if (dev_priv->gpu_error.score > FIRE) { /* Per-engine reset occuring too frequently. 
Fall back * to a global reset to try and resolve the hang */ dev_priv->gpu_error.score = 0; - hang_mask = GLOBAL_RESET_REQUEST; + hang_mask = I915_GLOBAL_RESET_REQUEST; DRM_DEBUG_TDR("Rings hanging too frequently\n"); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index b9125dc..7235720 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -66,7 +66,9 @@ struct intel_ring_hangcheck { int score; enum intel_ring_hangcheck_action action; u32 resets; /* Total resets applied to this ring/engine*/ + u32 total; /* Successful resets since boot; debugfs stat, clearable*/ u32 last_head; /* Head value recorded at last hang */ + u32 status_updated; /* Reset status already recorded for current hang*/ }; struct intel_ring_buffer { diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index c51e389..8ab5385 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -975,6 +975,9 @@ static int gen6_do_reset(struct drm_device *dev) /* Spin waiting for the device to ack the reset request */ ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500); + if (ret == 0) + dev_priv->gpu_error.global_resets++; + intel_uncore_forcewake_reset(dev); spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); @@ -998,8 +1001,6 @@ static int gen6_do_engine_reset(struct drm_device *dev, */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - ring->hangcheck.resets++; - /* Reset the engine. 
* GEN6_GDRST is not in the gt power well so no need to check * for fifo space for the write or forcewake the chip for @@ -1012,6 +1013,9 @@ static int gen6_do_engine_reset(struct drm_device *dev, ret = wait_for_atomic((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_RENDER) == 0, 500); + if (ret == 0) + ring->hangcheck.total++; + DRM_DEBUG_TDR("RCS Reset\n"); break; @@ -1022,6 +1026,9 @@ static int gen6_do_engine_reset(struct drm_device *dev, ret = wait_for_atomic((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_BLT) == 0, 500); + if (ret == 0) + ring->hangcheck.total++; + DRM_DEBUG_TDR("BCS Reset\n"); break; @@ -1032,6 +1039,9 @@ static int gen6_do_engine_reset(struct drm_device *dev, ret = wait_for_atomic((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_MEDIA) == 0, 500); + if (ret == 0) + ring->hangcheck.total++; + DRM_DEBUG_TDR("VCS Reset\n"); break; @@ -1041,6 +1051,9 @@ static int gen6_do_engine_reset(struct drm_device *dev, /* Spin waiting for the device to ack the reset request */ ret = wait_for_atomic((I915_READ_NOTRACE(GEN6_GDRST) & GEN6_GRDOM_VEBOX) == 0, 500); + if (ret == 0) + ring->hangcheck.total++; + DRM_DEBUG_TDR("VECS Reset\n"); break; @@ -1145,6 +1158,11 @@ int i915_handle_hung_ring(struct intel_ring_buffer *ring) i915_set_reset_status(ring, request, acthd); } + /* Flag that we have updated the stats for this ring in case + * a global reset comes in to prevent it from processing stats + * for this ring a second time */ + ring->hangcheck.status_updated = 1; + /* Check if the ring has hung on an MI_DISPLAY_FLIP command. * The pipe value will be stored in the HWS page if it has.*/ pipe = intel_read_status_page(ring, I915_GEM_PGFLIP_INDEX); -- 1.8.5.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx