Re: [Intel-gfx] [PATCH 004/190] drm/i915: Fix some invalid requests cancellations

Dave Gordon <david.s.gordon@xxxxxxxxx> · Tue, 12 Jan 2016 18:16:21 +0000

On 11/01/16 09:16, Chris Wilson wrote:
As we add the VMA to the request early, it may be cancelled during
execbuf reservation. This will leave the context object pointing to a
dangling request; i915_wait_request() simply skips the wait and so we
may unbind the object whilst it is still active.

I don't understand this; context objects don't point to requests, it's 
vice versa. The request has a pointer to the engine, and to the context, 
and adds +1 to the refcount on the latter.

However, if at any point we make a change to the hardware (and equally
importantly our bookkeeping in the driver), we cannot cancel the request
as what has already been written must be submitted. Submitting a partial
request is far easier than trying to unwind the incomplete change.

What change could be made to the hardware? Engine reset, perhaps, but 
even that doesn't necessarily invalidate a request in preparation for 
sending to the hardware.

Submitting a partial change seems likely to leave the h/w in an 
undefined state. Cancelling a request is (or ought to be) trivial; just 
reset the ringbuffer's tail pointer to where it was when the request was 
allocated. The engine doesn't read anything past the tail of the 
previous request until the new request is submitted.

Unfortunately this patch undoes the excess breadcrumb usage that olr
prevented, e.g. if we interrupt batchbuffer submission then we submit
the requests along with the memory writes and interrupt (even though we
do no real work). Disassociating requests from breadcrumbs (and
semaphores) is a topic for a past/future series, but now much more
important.

Another incomprehensible comment? OLR went away a long time ago now. And 
AFAIK batchbuffer submission cannot be interrupted by anything (or more 
accurately, anything that interrupts submission won't be able to write 
to the ringbuffer or submit new work).

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Daniel Vetter <daniel.vetter@xxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
---
  drivers/gpu/drm/i915/i915_drv.h            |  1 -
  drivers/gpu/drm/i915/i915_gem.c            |  7 ++-----
  drivers/gpu/drm/i915/i915_gem_context.c    | 21 +++++++++------------
  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 16 +++++-----------
  drivers/gpu/drm/i915/intel_display.c       |  2 +-
  drivers/gpu/drm/i915/intel_lrc.c           |  1 -
  6 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 747d2d84a18c..ec20814adb0c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2813,7 +2813,6 @@ int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
  			     struct drm_file *file_priv);
  void i915_gem_execbuffer_move_to_active(struct list_head *vmas,
  					struct drm_i915_gem_request *req);
-void i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params);
  int i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
  				   struct drm_i915_gem_execbuffer2 *args,
  				   struct list_head *vmas);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3ab529669448..fd24877eb0a0 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3384,12 +3384,9 @@ int i915_gpu_idle(struct drm_device *dev)
  				return ret;

  			ret = i915_switch_context(req);
-			if (ret) {
-				i915_gem_request_cancel(req);
-				return ret;
-			}
-
  			i915_add_request_no_flush(req);
+			if (ret)
+				return ret;

This seems like a bad idea. Looking at how we could get here (i.e. how 
could switch_context() return an error), we see things such as "failed 
to pin" or i915_gem_object_get_pages() failed.

With no real idea of what the GPU and/or CPU address spaces contain at 
this point, it seems unwise to charge ahead regardless.

.Dave.

  		}

  		ret = intel_ring_idle(ring);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index c25083c78ba7..e5e9a8918f19 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -661,7 +661,6 @@ static int do_switch(struct drm_i915_gem_request *req)
  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
  	struct intel_context *from = ring->last_context;
  	u32 hw_flags = 0;
-	bool uninitialized = false;
  	int ret, i;

  	if (from != NULL && ring == &dev_priv->ring[RCS]) {
@@ -768,6 +767,15 @@ static int do_switch(struct drm_i915_gem_request *req)
  			to->remap_slice &= ~(1<<i);
  	}

+	if (!to->legacy_hw_ctx.initialized) {
+		if (ring->init_context) {
+			ret = ring->init_context(req);
+			if (ret)
+				goto unpin_out;
+		}
+		to->legacy_hw_ctx.initialized = true;
+	}
+
  	/* The backing object for the context is done after switching to the
  	 * *next* context. Therefore we cannot retire the previous context until
  	 * the next context has already started running. In fact, the below code
@@ -791,21 +799,10 @@ static int do_switch(struct drm_i915_gem_request *req)
  		i915_gem_context_unreference(from);
  	}

-	uninitialized = !to->legacy_hw_ctx.initialized;
-	to->legacy_hw_ctx.initialized = true;
-
  done:
  	i915_gem_context_reference(to);
  	ring->last_context = to;

-	if (uninitialized) {
-		if (ring->init_context) {
-			ret = ring->init_context(req);
-			if (ret)
-				DRM_ERROR("ring init context: %d\n", ret);
-		}
-	}
-
  	return 0;

  unpin_out:
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index dccb517361b3..b8186bd061c1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1136,7 +1136,7 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
  	}
  }

-void
+static void
  i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params)
  {
  	/* Unconditionally force add_request to emit a full flush. */
@@ -1318,7 +1318,6 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
  	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);

  	i915_gem_execbuffer_move_to_active(vmas, params->request);
-	i915_gem_execbuffer_retire_commands(params);

  	return 0;
  }
@@ -1607,8 +1606,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
  		goto err_batch_unpin;

  	ret = i915_gem_request_add_to_client(params->request, file);
-	if (ret)
+	if (ret) {
+		i915_gem_request_cancel(params->request);
  		goto err_batch_unpin;
+	}

  	/*
  	 * Save assorted stuff away to pass through to *_submission().
@@ -1624,6 +1625,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
  	params->ctx                     = ctx;

  	ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
+	i915_gem_execbuffer_retire_commands(params);

  err_batch_unpin:
  	/*
@@ -1640,14 +1642,6 @@ err:
  	i915_gem_context_unreference(ctx);
  	eb_destroy(eb);

-	/*
-	 * If the request was created but not successfully submitted then it
-	 * must be freed again. If it was submitted then it is being tracked
-	 * on the active request list and no clean up is required here.
-	 */
-	if (ret && params->request)
-		i915_gem_request_cancel(params->request);
-
  	mutex_unlock(&dev->struct_mutex);

  pre_mutex_err:
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index b4cf9ce16155..959868c40018 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11751,7 +11751,7 @@ cleanup_unpin:
  	intel_unpin_fb_obj(fb, crtc->primary->state);
  cleanup_pending:
  	if (request)
-		i915_gem_request_cancel(request);
+		i915_add_request_no_flush(request);
  	atomic_dec(&intel_crtc->unpin_work_count);
  	mutex_unlock(&dev->struct_mutex);
  cleanup:
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f7fac5f3b5ce..7f17ba852b8a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -972,7 +972,6 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
  	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);

  	i915_gem_execbuffer_move_to_active(vmas, params->request);
-	i915_gem_execbuffer_retire_commands(params);

  	return 0;
  }


--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html