On 03/14/2013 08:52 AM, Mika Kuoppala wrote: > This ioctl returns context reset status for specified context. > > Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com> > CC: idr at freedesktop.org > --- > drivers/gpu/drm/i915/i915_dma.c | 1 + > drivers/gpu/drm/i915/i915_drv.c | 61 +++++++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 2 ++ > include/uapi/drm/i915_drm.h | 28 ++++++++++++++++++ > 4 files changed, 92 insertions(+) > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c > index 7902d97..c919832 100644 > --- a/drivers/gpu/drm/i915/i915_dma.c > +++ b/drivers/gpu/drm/i915/i915_dma.c > @@ -1903,6 +1903,7 @@ struct drm_ioctl_desc i915_ioctls[] = { > DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_CREATE, i915_gem_context_create_ioctl, DRM_UNLOCKED), > DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_DESTROY, i915_gem_context_destroy_ioctl, DRM_UNLOCKED), > DRM_IOCTL_DEF_DRV(I915_REG_READ, i915_reg_read_ioctl, DRM_UNLOCKED), > + DRM_IOCTL_DEF_DRV(I915_GET_RESET_STATUS, i915_gem_context_get_reset_status_ioctl, DRM_UNLOCKED), > }; > > int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls); > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index 69c9856..a4d06f2 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1267,3 +1267,64 @@ int i915_reg_read_ioctl(struct drm_device *dev, > > return 0; > } > + > +int i915_gem_context_get_reset_status_ioctl(struct drm_device *dev, > + void *data, struct drm_file *file) > +{ > + struct drm_i915_private *dev_priv = dev->dev_private; > + struct intel_ring_buffer *ring; > + struct drm_i915_reset_status *args = data; > + struct ctx_reset_state *rs = NULL; > + unsigned long reset_cnt; > + u32 reset_status = I915_RESET_UNKNOWN; > + int ret; > + > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + > + ring = &dev_priv->ring[RCS]; > + > + ret = i915_gem_context_get_reset_state(ring, > + file, > + 
args->ctx_id, > + &rs); > + if (ret) > + goto out; > + > + BUG_ON(!rs); > + > + reset_cnt = atomic_read(&dev_priv->gpu_error.reset_counter); > + > + if (reset_cnt & I915_RESET_IN_PROGRESS_FLAG || In this case, I believe we're supposed to return the reset state to the application. The ARB_robustness spec says: "If a reset status other than NO_ERROR is returned and subsequent calls return NO_ERROR, the context reset was encountered and completed. If a reset status is repeatedly returned, the context may be in the process of resetting." If the reset takes a long time, it seems that even a well-behaved app could run afoul of the 'banned' logic. > + reset_cnt == I915_WEDGED) { > + goto out; > + } > + > + /* Set guilty/innocent status if only one reset was > + * observed and if only one guilty was found > + */ > + if ((rs->reset_cnt + 2) == reset_cnt && > + (rs->guilty_cnt + 1) == dev_priv->gpu_error.guilty_cnt) { This logic seems... wrong, or at least weird. "rs->reset_cnt + 2" is confusing next to "if only one reset was observed". dev_priv->gpu_error.reset_counter is the global GPU reset count since start-up, and rs->reset_cnt is the value of that global GPU reset count at the time the context was created. Right? If that's the case, this will cause a context that was completely idle (i.e., didn't actually lose anything) to get a reset notification. That's an absolute deal breaker. If that's not the case, then this architecture needs a lot more documentation so that people new to it can understand what's happening. > + reset_status = 0; > + > + if (rs->guilty) > + reset_status |= I915_RESET_BATCH_ACTIVE; > + > + if (rs->innocent) > + reset_status |= I915_RESET_BATCH_PENDING; > + > + if (reset_status == 0) > + reset_status = I915_RESET_UNKNOWN; > + } else if (rs->reset_cnt == reset_cnt) { > + reset_status = I915_RESET_NO_ERROR; > + } > + > +out: > + if (!ret) > + args->reset_status = reset_status; > + > + mutex_unlock(&dev->struct_mutex); > + > + return ret ? 
-EINVAL : 0; > +} > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 3e11acf..2e5e8e7 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -1712,6 +1712,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, > struct drm_file *file); > int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data, > struct drm_file *file); > +int i915_gem_context_get_reset_status_ioctl(struct drm_device *dev, > + void *data, struct drm_file *file); > > /* i915_gem_gtt.c */ > void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev); > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index 07d5941..a195e0e 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea { > #define DRM_I915_GEM_SET_CACHING 0x2f > #define DRM_I915_GEM_GET_CACHING 0x30 > #define DRM_I915_REG_READ 0x31 > +#define DRM_I915_GET_RESET_STATUS 0x32 > > #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) > #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) > @@ -247,6 +248,7 @@ typedef struct _drm_i915_sarea { > #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create) > #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy) > #define DRM_IOCTL_I915_REG_READ DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read) > +#define DRM_IOCTL_I915_GET_RESET_STATUS DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATUS, struct drm_i915_reset_status) > > /* Allow drivers to submit batchbuffers directly to hardware, relying > * on the security mechanisms provided by hardware. 
> @@ -980,4 +982,30 @@ struct drm_i915_reg_read { > __u64 offset; > __u64 val; /* Return value */ > }; > + > +/* No reset observed */ > +#define I915_RESET_NO_ERROR 0 > + > +/* Context had batch processing active while > + gpu hung and batch was guilty of gpu hang */ > +#define I915_RESET_BATCH_ACTIVE (1 << 0) > + > +/* Context had batch queued for processing while > + reset occurred and guilty batch was found: > + I915_RESET_BATCH_ACTIVE was set for this or > + some other context */ > +#define I915_RESET_BATCH_PENDING (1 << 1) > + > +/* Context observed gpu hung and reset but guilty context > + was not found: I915_RESET_BATCH_ACTIVE and > + I915_RESET_BATCH_PENDING were not set for any context */ > +#define I915_RESET_UNKNOWN (1 << 2) > + > +struct drm_i915_reset_status { > + __u32 ctx_id; > + __u32 flags; > + __u32 reset_status; > + __u32 pad; > +}; > + > #endif /* _UAPI_I915_DRM_H_ */ >