Re: [CI 1/5] drm/i915: Allow disabling error capture

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 11 Oct 2016, Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> wrote:
> On Tue, Oct 11, 2016 at 01:16:42PM +0300, Jani Nikula wrote:
>> On Tue, 11 Oct 2016, Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> wrote:
>> > On Tue, Oct 11, 2016 at 12:52:00PM +0300, Jani Nikula wrote:
>> >> On Tue, 11 Oct 2016, Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> wrote:
>> >> > We currently capture the GPU state after we detect a hang. This is vital
>> >> > for us to both triage and debug hangs in the wild (post-mortem
>> >> > debugging). However, it comes at the cost of running some potentially
>> >> > dangerous code (since it has to make very few assumption about the state
>> >> > of the driver) that is quite resource intensive.
>> >> >
>> >> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
>> >> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>
>> >> > ---
>> >> >  drivers/gpu/drm/i915/Kconfig          | 10 ++++++++++
>> >> >  drivers/gpu/drm/i915/i915_debugfs.c   |  6 ++++++
>> >> >  drivers/gpu/drm/i915/i915_drv.h       | 16 ++++++++++++++++
>> >> >  drivers/gpu/drm/i915/i915_gpu_error.c |  7 +++++++
>> >> >  drivers/gpu/drm/i915/i915_params.c    |  9 +++++++++
>> >> >  drivers/gpu/drm/i915/i915_params.h    |  1 +
>> >> >  drivers/gpu/drm/i915/i915_sysfs.c     |  8 ++++++++
>> >> >  drivers/gpu/drm/i915/intel_display.c  |  4 ++++
>> >> >  drivers/gpu/drm/i915/intel_overlay.c  |  4 ++++
>> >> >  9 files changed, 65 insertions(+)
>> >> >
>> >> > diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
>> >> > index 7769e469118f..10a6ac11b6a9 100644
>> >> > --- a/drivers/gpu/drm/i915/Kconfig
>> >> > +++ b/drivers/gpu/drm/i915/Kconfig
>> >> > @@ -46,6 +46,16 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
>> >> >  
>> >> >  	  If in doubt, say "N".
>> >> >  
>> >> > +config DRM_I915_CAPTURE_ERROR
>> >> > +	bool "Enable capturing GPU state following a hang"
>> >> > +	depends on DRM_I915
>> >> > +	default y
>> >> > +	help
>> >> > +	  This option enables capturing the GPU state when a hang is detected.
>> >> > +	  This information is vital for triaging hangs and assists in debugging.
>> >> > +
>> >> > +	  If in doubt, say "Y".
>> >> > +
>> >> >  config DRM_I915_USERPTR
>> >> >  	bool "Always enable userptr support"
>> >> >  	depends on DRM_I915
>> >> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> >> > index 20689f1cd719..e4b5ba771bea 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> >> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> >> > @@ -960,6 +960,8 @@ static int i915_hws_info(struct seq_file *m, void *data)
>> >> >  	return 0;
>> >> >  }
>> >> >  
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +
>> >> >  static ssize_t
>> >> >  i915_error_state_write(struct file *filp,
>> >> >  		       const char __user *ubuf,
>> >> > @@ -1042,6 +1044,8 @@ static const struct file_operations i915_error_state_fops = {
>> >> >  	.release = i915_error_state_release,
>> >> >  };
>> >> >  
>> >> > +#endif
>> >> > +
>> >> >  static int
>> >> >  i915_next_seqno_get(void *data, u64 *val)
>> >> >  {
>> >> > @@ -5398,7 +5402,9 @@ static const struct i915_debugfs_files {
>> >> >  	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
>> >> >  	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
>> >> >  	{"i915_gem_drop_caches", &i915_drop_caches_fops},
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> >  	{"i915_error_state", &i915_error_state_fops},
>> >> > +#endif
>> >> >  	{"i915_next_seqno", &i915_next_seqno_fops},
>> >> >  	{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
>> >> >  	{"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
>> >> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> >> > index 54d860e1c0fc..4570c4fa0287 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> >> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> >> > @@ -3544,6 +3544,8 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {}
>> >> >  #endif
>> >> >  
>> >> >  /* i915_gpu_error.c */
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +
>> >> >  __printf(2, 3)
>> >> >  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
>> >> >  int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
>> >> > @@ -3564,6 +3566,20 @@ void i915_error_state_get(struct drm_device *dev,
>> >> >  void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
>> >> >  void i915_destroy_error_state(struct drm_device *dev);
>> >> >  
>> >> > +#else
>> >> > +
>> >> > +static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
>> >> > +					    u32 engine_mask,
>> >> > +					    const char *error_msg)
>> >> > +{
>> >> > +}
>> >> > +
>> >> > +static inline void i915_destroy_error_state(struct drm_device *dev)
>> >> > +{
>> >> > +}
>> >> > +
>> >> > +#endif
>> >> > +
>> >> >  void i915_get_engine_instdone(struct drm_i915_private *dev_priv,
>> >> >  			      enum intel_engine_id engine_id,
>> >> >  			      struct intel_instdone *instdone);
>> >> > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>> >> > index b5b58692ac5a..9b395ffa3b6a 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> >> > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> >> > @@ -30,6 +30,8 @@
>> >> >  #include <generated/utsrelease.h>
>> >> >  #include "i915_drv.h"
>> >> >  
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +
>> >> >  static const char *engine_str(int engine)
>> >> >  {
>> >> >  	switch (engine) {
>> >> > @@ -1464,6 +1466,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
>> >> >  	struct drm_i915_error_state *error;
>> >> >  	unsigned long flags;
>> >> >  
>> >> > +	if (!i915.error_capture)
>> >> > +		return;
>> >> > +
>> >> >  	if (READ_ONCE(dev_priv->gpu_error.first_error))
>> >> >  		return;
>> >> >  
>> >> > @@ -1549,6 +1554,8 @@ void i915_destroy_error_state(struct drm_device *dev)
>> >> >  		kref_put(&error->ref, i915_error_state_free);
>> >> >  }
>> >> >  
>> >> > +#endif
>> >> > +
>> >> >  const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
>> >> >  {
>> >> >  	switch (type) {
>> >> > diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
>> >> > index 768ad89d9cd4..e72a41223535 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_params.c
>> >> > +++ b/drivers/gpu/drm/i915/i915_params.c
>> >> > @@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = {
>> >> >  	.load_detect_test = 0,
>> >> >  	.force_reset_modeset_test = 0,
>> >> >  	.reset = true,
>> >> > +	.error_capture = true,
>> >> >  	.invert_brightness = 0,
>> >> >  	.disable_display = 0,
>> >> >  	.enable_cmd_parser = 1,
>> >> > @@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
>> >> >  module_param_named_unsafe(reset, i915.reset, bool, 0600);
>> >> >  MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
>> >> >  
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +module_param_named(error_capture, i915.error_capture, bool, 0600);
>> >> > +MODULE_PARM_DESC(error_capture,
>> >> > +	"Record the GPU state following a hang. "
>> >> > +	"This information in /sys/class/drm/card<N>/error is vital for "
>> >> > +	"triaging and debugging hangs.");
>> >> > +#endif
>> >> 
>> >> The addition of the module parameter is really orthogonal to the config
>> >> option, and should be added separately.
>> >> 
>> >> The question is, what is the target audience of
>> >> CONFIG_DRM_I915_CAPTURE_ERROR=n? Is the added config #ifdeffery worth
>> >> the trouble? It *is* a maintainability burden, because we will only ever
>> >> be building with CONFIG_DRM_I915_CAPTURE_ERROR=y ourselves, and we'll
>> >> screw it up anyway. Does i915.error_capture=0 accomplish the same thing
>> >> (apart from not reducing code size)? Do we really need to be able to
>> >> prevent the root from enabling error capture on the fly?
>> >
>> > What I had in mind was that RT kernels will probably default to turning
>> > off this facility (since it will cause latencies in the hundreds of ms
>> > range).
>> >  
>> >> What would be wrong with just adding the module parameter, and making
>> >> CONFIG_DRM_I915_CAPTURE_ERROR the default value for it? I.e.
>> >> 
>> >> 	.error_capture = IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR),
>> >> 
>> >> in the initialization. No #ifdefs anywhere.
>> >
>> > I wanted to turn it off by default at compilation time.
>> 
>> What I suggest above does that. ;)
>
> It's 30KiB of code that they don't want. I know that's only about 3% of
> the size of the driver, but it's a help.

Fair enough. Please copy-paste some of the elaboration to the commit
message. Ack from me, but it wouldn't hurt to get an ack from Daniel as
well.

BR,
Jani.

-- 
Jani Nikula, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux