On 03/19/2013 05:58 AM, Mika Kuoppala wrote: > Ian Romanick <idr at freedesktop.org> writes: > >> On 03/14/2013 08:52 AM, Mika Kuoppala wrote: >>> This ioctl returns context reset status for specified context. >>> >>> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com> >>> CC: idr at freedesktop.org >>> --- >>> drivers/gpu/drm/i915/i915_dma.c | 1 + >>> drivers/gpu/drm/i915/i915_drv.c | 61 +++++++++++++++++++++++++++++++++++++++ >>> drivers/gpu/drm/i915/i915_drv.h | 2 ++ >>> include/uapi/drm/i915_drm.h | 28 ++++++++++++++++++ >>> 4 files changed, 92 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c >>> index 7902d97..c919832 100644 >>> --- a/drivers/gpu/drm/i915/i915_dma.c >>> +++ b/drivers/gpu/drm/i915/i915_dma.c >>> @@ -1903,6 +1903,7 @@ struct drm_ioctl_desc i915_ioctls[] = { >>> DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_CREATE, i915_gem_context_create_ioctl, DRM_UNLOCKED), >>> DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_DESTROY, i915_gem_context_destroy_ioctl, DRM_UNLOCKED), >>> DRM_IOCTL_DEF_DRV(I915_REG_READ, i915_reg_read_ioctl, DRM_UNLOCKED), >>> + DRM_IOCTL_DEF_DRV(I915_GET_RESET_STATUS, i915_gem_context_get_reset_status_ioctl, DRM_UNLOCKED), >>> }; >>> >>> int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls); >>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c >>> index 69c9856..a4d06f2 100644 >>> --- a/drivers/gpu/drm/i915/i915_drv.c >>> +++ b/drivers/gpu/drm/i915/i915_drv.c >>> @@ -1267,3 +1267,64 @@ int i915_reg_read_ioctl(struct drm_device *dev, >>> >>> return 0; >>> } >>> + >>> +int i915_gem_context_get_reset_status_ioctl(struct drm_device *dev, >>> + void *data, struct drm_file *file) >>> +{ >>> + struct drm_i915_private *dev_priv = dev->dev_private; >>> + struct intel_ring_buffer *ring; >>> + struct drm_i915_reset_status *args = data; >>> + struct ctx_reset_state *rs = NULL; >>> + unsigned long reset_cnt; >>> + u32 reset_status = I915_RESET_UNKNOWN; >>> + int ret; >>> + >>> + ret = 
mutex_lock_interruptible(&dev->struct_mutex); >>> + if (ret) >>> + return ret; >>> + >>> + ring = &dev_priv->ring[RCS]; >>> + >>> + ret = i915_gem_context_get_reset_state(ring, >>> + file, >>> + args->ctx_id, >>> + &rs); >>> + if (ret) >>> + goto out; >>> + >>> + BUG_ON(!rs); >>> + >>> + reset_cnt = atomic_read(&dev_priv->gpu_error.reset_counter); >>> + >>> + if (reset_cnt & I915_RESET_IN_PROGRESS_FLAG || >> >> In this case, I believe we're supposed to return the reset state to the >> application. The ARB_robustness spec says: >> >> "If a reset status other than NO_ERROR is returned and subsequent >> calls return NO_ERROR, the context reset was encountered and >> completed. If a reset status is repeatedly returned, the context may >> be in the process of resetting." >> >> If the reset takes a long time, it seems that even a well-behaved app >> could run afoul of the 'banned' logic. > > As the reset status is initialized to I915_RESET_UNKNOWN, > we return it if the reset is in progress or the GPU is wedged. Hmm... so user space will see I915_RESET_UNKNOWN until the reset is done, then it will (usually) see either I915_RESET_BATCH_ACTIVE or I915_RESET_BATCH_PENDING. I think that should be okay. >>> + reset_cnt == I915_WEDGED) { >>> + goto out; >>> + } >>> + >>> + /* Set guilty/innocent status if only one reset was >>> + * observed and if only one guilty was found >>> + */ >>> + if ((rs->reset_cnt + 2) == reset_cnt && >>> + (rs->guilty_cnt + 1) == dev_priv->gpu_error.guilty_cnt) { >> >> This logic seems... wrong, or at least weird. "rs->reset_cnt + 2" is >> confusing next to "if only one reset was observed". >> >> dev_priv->gpu_error.reset_counter is the global GPU reset count since >> start-up, and rs->reset_cnt is the global GPU count since start-up when >> the context was created. Right? > > Right. The confusing part here is the > dev_priv->gpu_error.reset_counter. If it is odd, a reset is in progress, > if it is even, the reset has been handled and all is well. 
That is why +2. That's a clever hack, I'm assuming, to use atomic operations instead of locks. Dear God that's awful to understand... it's a tiny bit clearer looking back at the 'reset_cnt & I915_RESET_IN_PROGRESS_FLAG'. Perhaps we could get some wrapper macros RESET_IN_PROGRESS() and RESET_ACTUAL_COUNT() or something? >> If that's the case, this will cause a context that was completely idle >> (i.e., didn't actually lose anything) to get a reset notification. >> That's an absolute deal breaker. > > This was then misunderstood by me. I will make it so that if you have > no batches submitted, you won't observe a reset. > >> If that's not the case, then this architecture needs a lot more >> documentation so that people new to it can understand what's happening. > > Agreed. If we don't need to care about the contexts where there > were no batches submitted, the logic will be simpler, though. > > -Mika > >>> + reset_status = 0; >>> + >>> + if (rs->guilty) >>> + reset_status |= I915_RESET_BATCH_ACTIVE; >>> + >>> + if (rs->innocent) >>> + reset_status |= I915_RESET_BATCH_PENDING; >>> + >>> + if (reset_status == 0) >>> + reset_status = I915_RESET_UNKNOWN; >>> + } else if (rs->reset_cnt == reset_cnt) { >>> + reset_status = I915_RESET_NO_ERROR; >>> + } >>> + >>> +out: >>> + if (!ret) >>> + args->reset_status = reset_status; >>> + >>> + mutex_unlock(&dev->struct_mutex); >>> + >>> + return ret ? 
-EINVAL : 0; >>> +} >>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h >>> index 3e11acf..2e5e8e7 100644 >>> --- a/drivers/gpu/drm/i915/i915_drv.h >>> +++ b/drivers/gpu/drm/i915/i915_drv.h >>> @@ -1712,6 +1712,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file); >>> int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file); >>> +int i915_gem_context_get_reset_status_ioctl(struct drm_device *dev, >>> + void *data, struct drm_file *file); >>> >>> /* i915_gem_gtt.c */ >>> void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev); >>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h >>> index 07d5941..a195e0e 100644 >>> --- a/include/uapi/drm/i915_drm.h >>> +++ b/include/uapi/drm/i915_drm.h >>> @@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea { >>> #define DRM_I915_GEM_SET_CACHING 0x2f >>> #define DRM_I915_GEM_GET_CACHING 0x30 >>> #define DRM_I915_REG_READ 0x31 >>> +#define DRM_I915_GET_RESET_STATUS 0x32 >>> >>> #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) >>> #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) >>> @@ -247,6 +248,7 @@ typedef struct _drm_i915_sarea { >>> #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create) >>> #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy) >>> #define DRM_IOCTL_I915_REG_READ DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read) >>> +#define DRM_IOCTL_I915_GET_RESET_STATUS DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATUS, struct drm_i915_reset_status) >>> >>> /* Allow drivers to submit batchbuffers directly to hardware, relying >>> * on the security mechanisms provided by hardware. 
>>> @@ -980,4 +982,30 @@ struct drm_i915_reg_read { >>> __u64 offset; >>> __u64 val; /* Return value */ >>> }; >>> + >>> +/* No reset observed */ >>> +#define I915_RESET_NO_ERROR 0 >>> + >>> +/* Context had batch processing active while >>> + gpu hung and batch was guilty of gpu hang */ >>> +#define I915_RESET_BATCH_ACTIVE (1 << 0) >>> + >>> +/* Context had batch queued for processing while >>> + reset occurred and guilty batch was found: >>> + I915_RESET_BATCH_ACTIVE was set for this or >>> + some other context */ >>> +#define I915_RESET_BATCH_PENDING (1 << 1) >>> + >>> +/* Context observed gpu hung and reset but guilty context >>> + was not found: I915_RESET_BATCH_ACTIVE and >>> + I915_RESET_BATCH_PENDING were not set for any context */ >>> +#define I915_RESET_UNKNOWN (1 << 2) >>> + >>> +struct drm_i915_reset_status { >>> + __u32 ctx_id; >>> + __u32 flags; >>> + __u32 reset_status; >>> + __u32 pad; >>> +}; >>> + >>> #endif /* _UAPI_I915_DRM_H_ */ >>>