The error state is purposefully racy as we expect it to be called at any time and so have avoided any locking whilst capturing the crash dump. However, with multi-engine GPUs and multiple CPUs, those races can manifest into OOPSes as we attempt to chase dangling pointers freed on other CPUs. Under discussion are lots of ways to slow down normal operation in order to protect the post-mortem error capture, but what it we take the opposite approach and freeze the machine whilst the error catpure runs (note the GPU may still running, but as long as we don't process any of the results the driver's bookkeeping will be static). Note that by of itself, this is not a complete fix. It also depends on the compiler barriers in list_add/list_del to prevent transversing the lists into the void. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/Kconfig | 1 + drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_gpu_error.c | 35 +++++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 33e8563c2f99..17841b36b1df 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -6,6 +6,7 @@ config DRM_I915 select INTEL_GTT select AGP_INTEL if AGP select INTERVAL_TREE + select STOP_MACHINE # we need shmfs for the swappable backing store, and in particular # the shmem_readpage() which depends upon tmpfs select SHMEM diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f1447dbb3c55..89da35105a33 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -484,6 +484,8 @@ struct drm_i915_error_state { struct kref ref; struct timeval time; + struct drm_i915_private *i915; + char error_msg[128]; bool simulated; int iommu; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 98d0a6a53cc7..a3090d7ac20a 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -28,6 +28,7 @@ */ #include <generated/utsrelease.h> +#include <linux/stop_machine.h> #include "i915_drv.h" static const char *ring_str(int ring) @@ -1262,6 +1263,26 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv, error->suspend_count = dev_priv->suspend_count; } +static int capture(void *data) +{ + struct drm_i915_error_state *error = data; + + i915_capture_gen_state(error->i915, error); + i915_capture_reg_state(error->i915, error); + i915_gem_record_fences(error->i915->dev, error); + i915_gem_record_rings(error->i915->dev, error); + + i915_capture_active_buffers(error->i915, error); + i915_capture_pinned_buffers(error->i915, error); + + do_gettimeofday(&error->time); + + error->overlay = intel_overlay_capture_error_state(error->i915->dev); + error->display = intel_display_capture_error_state(error->i915->dev); + + return 0; +} + /** * i915_capture_error_state - capture an error record for later analysis * @dev: drm device @@ -1290,19 +1311,9 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged, } kref_init(&error->ref); + error->i915 = dev_priv; - i915_capture_gen_state(dev_priv, error); - i915_capture_reg_state(dev_priv, error); - i915_gem_record_fences(dev, error); - i915_gem_record_rings(dev, error); - - i915_capture_active_buffers(dev_priv, error); - i915_capture_pinned_buffers(dev_priv, error); - - do_gettimeofday(&error->time); - - error->overlay = intel_overlay_capture_error_state(dev); - error->display = intel_display_capture_error_state(dev); + stop_machine(capture, error, NULL); i915_error_capture_msg(dev, error, wedged, error_msg); DRM_INFO("%s\n", error->error_msg); -- 2.7.0.rc3 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx