On Tue, Oct 10, 2017 at 3:39 PM, Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> wrote:
> Style nits:
>
> Quoting Daniel Vetter (2017-10-09 17:44:01)
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 82a10036fb38..8d7d8d1f78db 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -3057,49 +3057,71 @@ static void nop_submit_request(struct drm_i915_gem_request *request)
>>
>>  	spin_lock_irqsave(&request->engine->timeline->lock, flags);
>>  	__i915_gem_request_submit(request);
>> -	intel_engine_init_global_seqno(request->engine, request->global_seqno);
>>  	spin_unlock_irqrestore(&request->engine->timeline->lock, flags);
>
> This reduced to i915_gem_request_submit().
>
> Hah, you can just engine->submit_request = i915_gem_request_submit, and
> keep the nop_submit_request for the second phase! That may make the
> diff neater?

We need to call dma_fence_set_error still, and we need to start doing
that before we start calling ->cancel_requests, or we might miss some
requests.

I'll do all the other suggestions and resubmit.
-Daniel

>> -static int __i915_gem_set_wedged_BKL(void *data)
>> +void i915_gem_set_wedged(struct drm_i915_private *i915)
>>  {
>> -	struct drm_i915_private *i915 = data;
>>  	struct intel_engine_cs *engine;
>>  	enum intel_engine_id id;
>>
>> +	/* First, stop submission to hw, but do not yet complete requests by
>> +	 * rolling the global seqno forward (since this would complete requests
>> +	 * for which we haven't set the fence error to EIO yet).
>> +	 */
>
> I'm doing a quiet fixup of all my comments to follow
> /*
>  * The core convention, which you normally use anyway.
>  */
>
>>  	for_each_engine(engine, i915, id)
>> -		engine_set_wedged(engine);
>> +		engine->submit_request = nop_submit_request;
>>
>> -	set_bit(I915_WEDGED, &i915->gpu_error.flags);
>> -	wake_up_all(&i915->gpu_error.reset_queue);
>> +	/* Make sure no one is running the old callback before we proceed with
>> +	 * cancelling requests and resetting the completion tracking. Otherwise
>> +	 * we might submit a request to the hardware which never completes.
>> +	 */
>> +	synchronize_rcu();
>>
>> -	return 0;
>> -}
>> +	for_each_engine(engine, i915, id) {
>> +		/* Mark all executing requests as skipped */
>> +		engine->cancel_requests(engine);
>>
>> -void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>> -{
>> -	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
>> +		/* Only once we've force-cancelled all in-flight requests can we
>> +		 * start to complete all requests.
>> +		 */
>> +		engine->submit_request = nop_complete_submit_request;
>> +	}
>> +
>> +	/* Make sure no request can slip through without getting completed by
>> +	 * either this call here to intel_engine_init_global_seqno, or the one
>> +	 * in nop_complete_submit_request.
>> +	 */
>> +	synchronize_rcu();
>> +
>> +	for_each_engine(engine, i915, id) {
>> +		unsigned long flags;
>> +
>> +		/* Mark all pending requests as complete so that any concurrent
>> +		 * (lockless) lookup doesn't try and wait upon the request as we
>> +		 * reset it.
>> +		 */
>> +		spin_lock_irqsave(&engine->timeline->lock, flags);
>> +		intel_engine_init_global_seqno(engine,
>> +					       intel_engine_last_submit(engine));
>> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>> +	}
>> +
>> +	set_bit(I915_WEDGED, &i915->gpu_error.flags);
>> +	wake_up_all(&i915->gpu_error.reset_queue);
>>  }
>>
>>  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>> index b100b38f1dd2..ef78a85cb845 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_request.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
>> @@ -556,7 +556,9 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>>  	switch (state) {
>>  	case FENCE_COMPLETE:
>>  		trace_i915_gem_request_submit(request);
>> +		rcu_read_lock();
>
> This trick needs a comment.
>
> /*
>  * We need to serialize use of the submit_request() callback with its
>  * hotplugging performed during an emergency i915_gem_set_wedged().
>  * We use the RCU mechanism to mark the critical section in order to
>  * force i915_gem_set_wedged() to wait until the submit_request() is
>  * completed before proceeding.
>  */
> -Chris
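
Aside, for anyone following along: the trick under discussion boils down to
swapping a function pointer and then using synchronize_rcu() to wait out every
caller that wraps the call in rcu_read_lock(). Below is a minimal standalone
sketch of just that pattern, written against userspace liburcu (whose API
deliberately mirrors the kernel's); all of the names in it (fake_request,
real_submit, nop_submit, submitter_thread) are made up for illustration and
are not the i915 ones.

/*
 * Sketch of the RCU-protected callback swap, userspace edition.
 * Build (assuming liburcu is installed): gcc rcu_swap.c -lurcu -lpthread
 */
#include <pthread.h>
#include <stdio.h>
#include <urcu.h>		/* rcu_read_lock(), synchronize_rcu(), ... */

struct fake_request {
	int seqno;
};

/* Normal path: hand the request to the "hardware". */
static void real_submit(struct fake_request *rq)
{
	printf("request %d submitted to hw\n", rq->seqno);
}

/* Wedged path: complete the request with an error instead. */
static void nop_submit(struct fake_request *rq)
{
	printf("request %d completed as -EIO (wedged)\n", rq->seqno);
}

/*
 * Stands in for engine->submit_request. Like the patch, swap it with a plain
 * pointer store and rely on RCU for the ordering; a stricter userspace
 * version would use rcu_assign_pointer()/rcu_dereference().
 */
static void (*submit_request)(struct fake_request *rq) = real_submit;

/* Reader side: corresponds to submit_notify() taking rcu_read_lock(). */
static void *submitter_thread(void *arg)
{
	struct fake_request rq = { .seqno = 1 };

	(void)arg;
	rcu_register_thread();

	rcu_read_lock();
	/*
	 * If we observed the old callback here, the writer's
	 * synchronize_rcu() below cannot return until we have left this
	 * read-side critical section.
	 */
	submit_request(&rq);
	rcu_read_unlock();

	rcu_unregister_thread();
	return NULL;
}

int main(void)
{
	pthread_t thread;

	rcu_register_thread();
	pthread_create(&thread, NULL, submitter_thread, NULL);

	/* Writer side: corresponds to set_wedged hotplugging the vfunc. */
	submit_request = nop_submit;

	/*
	 * After this returns, nobody can still be running (or about to run)
	 * real_submit(): pre-existing readers have finished, and new readers
	 * are guaranteed to see nop_submit.
	 */
	synchronize_rcu();

	pthread_join(thread, NULL);
	rcu_unregister_thread();
	return 0;
}

The two synchronize_rcu() calls in the set_wedged rewrite lean on exactly this
guarantee: the first makes sure nobody is still running the original
submit_request before requests are cancelled, the second that nobody is still
in the first-phase nop before the global seqno is rolled forward.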

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch