Re: [RFC 08/13] drm/i915: TDR loose ends

Daniel Vetter <daniel@xxxxxxxx> · Mon, 16 Dec 2013 19:14:59 +0100

On Mon, Dec 16, 2013 at 04:03:18PM +0000, Lister, Ian wrote:
> From b6c143ee1f0b65053186be2602f876cc4fd944ab Mon Sep 17 00:00:00 2001
> Message-Id: <b6c143ee1f0b65053186be2602f876cc4fd944ab.1387201899.git.ian.lister@xxxxxxxxx>
> In-Reply-To: <cover.1387201899.git.ian.lister@xxxxxxxxx>
> References: <cover.1387201899.git.ian.lister@xxxxxxxxx>
> From: ian-lister <ian.lister@xxxxxxxxx>
> Date: Tue, 10 Dec 2013 15:01:55 +0000
> Subject: [RFC 08/13] drm/i915: TDR loose ends
> 
> Added check for HANGCHECK_IDLE in i915_set_reset_status as a request
> should not be guilty if the ring is recorded as idle.
> 
> Cancel the gpu_error.work item on calling i915_suspend.

This needs to be a separate patch with a nice commit message explaing what
it fixes exactly. Nice catch though ;-)

> Take gpu_error.lock when clearing wedged condition in entervt_ioctl.

This is only ever used for legacy userspace running in userspace
modesetting mode. This stopped being a thing from gen6 onwards. Do not
touch ;-)
> 
> Signed-off-by: ian-lister <ian.lister@xxxxxxxxx>

Generally please don't squash unrelated changes together. The reason for
that (besides that it helps with review) is that it makes bisecting more
powerful and reverting changes easier. More comments below.
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_dma.c         |  2 ++
>  drivers/gpu/drm/i915/i915_drv.c         | 46 +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h         | 15 +++++++++++
>  drivers/gpu/drm/i915/i915_gem.c         | 14 +++++++---
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>  5 files changed, 74 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index e17a5d20..d45e936 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1661,6 +1661,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  		acpi_video_register();
>  	}
>  
> +	i915_init_watchdog(dev);
> +
>  	if (IS_GEN5(dev))
>  		intel_gpu_ips_init(dev_priv);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 38cd27d..42ccfee 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -154,6 +154,11 @@ module_param_named(prefault_disable, i915_prefault_disable, bool, 0600);
>  MODULE_PARM_DESC(prefault_disable,
>  		"Disable page prefaulting for pread/pwrite/reloc (default:false). For developers only.");
>  
> +int i915_enable_watchdog __read_mostly = 1;
> +module_param_named(i915_enable_watchdog, i915_enable_watchdog, int, 0644);
> +MODULE_PARM_DESC(i915_enable_watchdog,
> +			"Enable watchdog timers (default: true)");
> +
>  static struct drm_driver driver;
>  #if IS_ENABLED(CONFIG_AGP_INTEL)
>  extern int intel_agp_enabled;
> @@ -736,6 +741,47 @@ int i915_resume(struct drm_device *dev)
>  	return 0;
>  }
>  
> +void i915_init_watchdog(struct drm_device *dev)
> +{
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +	struct intel_ring_buffer *ring;
> +	int freq;
> +	int i;
> +	u32 tout;
> +
> +	/* Based on pre-defined time out value (60ms or 30ms) calculate
> +	 * timer count thresholds needed based on core frequency.
> +	 *
> +	 * For RCS:
> +	 * The timestamp resolution changed in Gen7 and beyond to 80ns
> +	 * for all pipes. Before that it was 640ns.
> +	 */
> +	if (INTEL_INFO(dev)->gen >= 7)
> +		freq = KM_TIMESTAMP_CNTS_PER_SEC_80NS;
> +	else
> +		freq = KM_TIMESTAMP_CNTS_PER_SEC_640NS;
> +
> +	for_each_ring(ring, dev_priv, i) {
> +		switch (ring->id) {
> +		case RCS:
> +			tout = KM_MEDIA_ENGINE_TIMEOUT_VALUE_IN_MS;
> +			break;
> +		case VCS:
> +			tout = KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS;
> +			break;
> +		default:
> +			tout = 0;
> +			break;
> +		}
> +
> +		ring->hangcheck.watchdog_threshold =
> +			(tout * (freq / KM_TIMER_MILLISECOND));
> +
> +		DRM_DEBUG_TDR("%s watchdog threshold = %d\n",
> +			ring->name, ring->hangcheck.watchdog_threshold);
> +	}
> +}		
> +

I guess this should be part of the next patch which starts with the
watchdog support?

>  /**
>   * i915_reset - reset chip after a hang
>   * @dev: drm device to reset
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 1087e4e..6e379f1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1893,6 +1893,19 @@ struct drm_i915_file_private {
>  
>  #define GT_FREQUENCY_MULTIPLIER 50
>  
> +#define KM_MEDIA_ENGINE_TIMEOUT_VALUE_IN_MS 60
> +#define KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS 60
> +#define KM_TIMER_MILLISECOND 1000
> +
> +/* Timestamp timer resolution = 0.080 uSec, or 12500000 counts per second*/
> +#define KM_TIMESTAMP_CNTS_PER_SEC_80NS 12500000
> +
> +/* Timestamp timer resolution = 0.640 uSec, or 1562500 counts per second*/
> +#define KM_TIMESTAMP_CNTS_PER_SEC_640NS 1562500
> +
> +#define KM_TIMER_MHZ 1000000
> +#define KM_CD_CLK_FREQ (450 * KM_TIMER_MHZ)
> +
>  #include "i915_trace.h"
>  
>  extern const struct drm_ioctl_desc i915_ioctls[];
> @@ -1908,6 +1921,7 @@ extern int i915_vbt_sdvo_panel_type __read_mostly;
>  extern int i915_enable_rc6 __read_mostly;
>  extern int i915_enable_fbc __read_mostly;
>  extern bool i915_enable_hangcheck __read_mostly;
> +extern int i915_enable_watchdog __read_mostly;
>  extern int i915_enable_ppgtt __read_mostly;
>  extern int i915_enable_psr __read_mostly;
>  extern unsigned int i915_preliminary_hw_support __read_mostly;
> @@ -1942,6 +1956,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
>  extern int i915_emit_box(struct drm_device *dev,
>  			 struct drm_clip_rect *box,
>  			 int DR1, int DR4);
> +extern void i915_init_watchdog(struct drm_device *dev);
>  extern int intel_gpu_reset(struct drm_device *dev);
>  extern int intel_gpu_engine_reset(struct drm_device *dev,
>  				enum intel_ring_id engine);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5b76da4..a762444 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2251,7 +2251,7 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
>  		if (i915_head_inside_object(acthd, request->batch_obj,
>  					    request_to_vm(request))) {
>  			*inside = true;
> -			DRM_DEBUG_TDR("Head in obj 0x%08x\n",
> +			DRM_DEBUG_TDR("Head in obj %p\n",
>  				request->batch_obj);
>  			return true;
>  		}
> @@ -2259,7 +2259,7 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
>  
>  	if (i915_head_inside_request(acthd, request->head, request->tail)) {
>  		*inside = false;
> -		DRM_DEBUG_TDR("Head in req 0x%08x (HD: 0x%08x TL: 0x%08x)\n",
> +		DRM_DEBUG_TDR("Head in req %p (HD: 0x%08x TL: 0x%08x)\n",
>  			request, request->head, request->tail);
>  		return true;
>  	}
> @@ -2297,8 +2297,9 @@ void i915_set_reset_status(struct intel_ring_buffer *ring,
>  		offset = i915_gem_obj_offset(request->batch_obj,
>  					     request_to_vm(request));
>  
> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
> -	    i915_request_guilty(request, acthd, &inside)) {
> +	if (!((ring->hangcheck.action == HANGCHECK_WAIT)
> +		|| (ring->hangcheck.action == HANGCHECK_IDLE))

Actually this looks a bit broken already - a stuck WAIT should imo be
blamed on relevant userspace.

> +	&& i915_request_guilty(request, acthd, &inside)) {
>  		DRM_ERROR("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>  			  ring->name,
>  			  inside ? "inside" : "flushing",
> @@ -4300,6 +4301,7 @@ i915_gem_suspend(struct drm_device *dev)
>  
>  	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
>  	atomic_set(&dev_priv->gpu_error.active, 0);
> +	cancel_work_sync(&dev_priv->gpu_error.work);
>  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
>  	cancel_delayed_work_sync(&dev_priv->mm.idle_work);
>  
> @@ -4527,6 +4529,7 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
>  		       struct drm_file *file_priv)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> +	unsigned long irqflags;
>  	int ret;
>  
>  	if (drm_core_check_feature(dev, DRIVER_MODESET))
> @@ -4534,7 +4537,10 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
>  
>  	if (i915_reset_in_progress(&dev_priv->gpu_error)) {
>  		DRM_ERROR("Reenabling wedged hardware, good luck\n");
> +		spin_lock_irqsave(&dev_priv->gpu_error.lock, irqflags);
> +		dev_priv->gpu_error.reset_requests = 0;
>  		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
> +		spin_unlock_irqrestore(&dev_priv->gpu_error.lock, irqflags);
>  	}
>  
>  	mutex_lock(&dev->struct_mutex);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 7235720..c3d6cbb 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -69,6 +69,7 @@ struct intel_ring_hangcheck {
>  	u32 total; /* Total resets applied to this ring/engine*/
>  	u32 last_head; /* Head value recorded at last hang */
>  	u32 status_updated;
> +	u32 watchdog_threshold;
>  };
>  
>  struct  intel_ring_buffer {
> -- 
> 1.8.5.1
> 
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx