Migrate the request operations out of the main body of i915_gem.c and into their own C file for easier expansion. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/i915_drv.h | 186 +----------- drivers/gpu/drm/i915/i915_gem.c | 502 +------------------------------ drivers/gpu/drm/i915/i915_gem_request.c | 513 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem_request.h | 209 +++++++++++++ 5 files changed, 732 insertions(+), 679 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_gem_request.c create mode 100644 drivers/gpu/drm/i915/i915_gem_request.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index d3b9d3618719..3df88be4752e 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -29,6 +29,7 @@ i915-y += i915_cmd_parser.o \ i915_gem_gtt.o \ i915_gem.o \ i915_gem_render_state.o \ + i915_gem_request.o \ i915_gem_shrinker.o \ i915_gem_stolen.o \ i915_gem_tiling.o \ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c511b3cbf9b2..9ffb4b7eebd9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -39,6 +39,7 @@ #include "intel_lrc.h" #include "i915_gem_gtt.h" #include "i915_gem_render_state.h" +#include "i915_gem_request.h" #include <linux/io-mapping.h> #include <linux/i2c.h> #include <linux/i2c-algo-bit.h> @@ -2154,170 +2155,6 @@ void i915_gem_track_fb(struct drm_i915_gem_object *old, struct drm_i915_gem_object *new, unsigned frontbuffer_bits); -/** - * Request queue structure. - * - * The request queue allows us to note sequence numbers that have been emitted - * and may be associated with active buffers to be retired. - * - * By keeping this list, we can avoid having to do questionable sequence - * number comparisons on buffer last_read|write_seqno. It also allows an - * emission time to be associated with the request for tracking how far ahead - * of the GPU the submission is. - * - * The requests are reference counted, so upon creation they should have an - * initial reference taken using kref_init - */ -struct drm_i915_gem_request { - struct kref ref; - - /** On Which ring this request was generated */ - struct drm_i915_private *i915; - struct intel_engine_cs *ring; - unsigned reset_counter; - - /** GEM sequence number associated with the previous request, - * when the HWS breadcrumb is equal to this the GPU is processing - * this request. - */ - u32 previous_seqno; - - /** GEM sequence number associated with this request, - * when the HWS breadcrumb is equal or greater than this the GPU - * has finished processing this request. - */ - u32 seqno; - - /** Position in the ringbuffer of the start of the request */ - u32 head; - - /** - * Position in the ringbuffer of the start of the postfix. - * This is required to calculate the maximum available ringbuffer - * space without overwriting the postfix. - */ - u32 postfix; - - /** Position in the ringbuffer of the end of the whole request */ - u32 tail; - - /** - * Context and ring buffer related to this request - * Contexts are refcounted, so when this request is associated with a - * context, we must increment the context's refcount, to guarantee that - * it persists while any request is linked to it. Requests themselves - * are also refcounted, so the request will only be freed when the last - * reference to it is dismissed, and the code in - * i915_gem_request_free() will then decrement the refcount on the - * context. - */ - struct intel_context *ctx; - struct intel_ringbuffer *ringbuf; - - /** Batch buffer related to this request if any (used for - error state dump only) */ - struct drm_i915_gem_object *batch_obj; - - /** Time at which this request was emitted, in jiffies. */ - unsigned long emitted_jiffies; - - /** global list entry for this request */ - struct list_head list; - - struct drm_i915_file_private *file_priv; - /** file_priv list entry for this request */ - struct list_head client_list; - - /** process identifier submitting this request */ - struct pid *pid; - - /** - * The ELSP only accepts two elements at a time, so we queue - * context/tail pairs on a given queue (ring->execlist_queue) until the - * hardware is available. The queue serves a double purpose: we also use - * it to keep track of the up to 2 contexts currently in the hardware - * (usually one in execution and the other queued up by the GPU): We - * only remove elements from the head of the queue when the hardware - * informs us that an element has been completed. - * - * All accesses to the queue are mediated by a spinlock - * (ring->execlist_lock). - */ - - /** Execlist link in the submission queue.*/ - struct list_head execlist_link; - - /** Execlists no. of times this request has been sent to the ELSP */ - int elsp_submitted; - -}; - -int i915_gem_request_alloc(struct intel_engine_cs *ring, - struct intel_context *ctx, - struct drm_i915_gem_request **req_out); -void i915_gem_request_cancel(struct drm_i915_gem_request *req); -void i915_gem_request_free(struct kref *req_ref); -int i915_gem_request_add_to_client(struct drm_i915_gem_request *req, - struct drm_file *file); - -static inline uint32_t -i915_gem_request_get_seqno(struct drm_i915_gem_request *req) -{ - return req ? req->seqno : 0; -} - -static inline struct intel_engine_cs * -i915_gem_request_get_ring(struct drm_i915_gem_request *req) -{ - return req ? req->ring : NULL; -} - -static inline struct drm_i915_gem_request * -i915_gem_request_reference(struct drm_i915_gem_request *req) -{ - if (req) - kref_get(&req->ref); - return req; -} - -static inline void -i915_gem_request_unreference(struct drm_i915_gem_request *req) -{ - WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex)); - kref_put(&req->ref, i915_gem_request_free); -} - -static inline void -i915_gem_request_unreference__unlocked(struct drm_i915_gem_request *req) -{ - struct drm_device *dev; - - if (!req) - return; - - dev = req->ring->dev; - if (kref_put_mutex(&req->ref, i915_gem_request_free, &dev->struct_mutex)) - mutex_unlock(&dev->struct_mutex); -} - -static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst, - struct drm_i915_gem_request *src) -{ - if (src) - i915_gem_request_reference(src); - - if (*pdst) - i915_gem_request_unreference(*pdst); - - *pdst = src; -} - -/* - * XXX: i915_gem_request_completed should be here but currently needs the - * definition of i915_seqno_passed() which is below. It will be moved in - * a later patch when the call to i915_seqno_passed() is obsoleted... - */ - /* * A command that requires special handling by the command parser. */ @@ -2966,27 +2803,6 @@ int i915_gem_dumb_create(struct drm_file *file_priv, struct drm_mode_create_dumb *args); int i915_gem_mmap_gtt(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset); -/** - * Returns true if seq1 is later than seq2. - */ -static inline bool -i915_seqno_passed(uint32_t seq1, uint32_t seq2) -{ - return (int32_t)(seq1 - seq2) >= 0; -} - -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req) -{ - return i915_seqno_passed(intel_ring_get_seqno(req->ring), - req->previous_seqno); -} - -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req) -{ - return i915_seqno_passed(intel_ring_get_seqno(req->ring), - req->seqno); -} - int __must_check i915_gem_get_seqno(struct drm_device *dev, u32 *seqno); int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 2e87639ecd1e..a23dea27f272 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1105,389 +1105,6 @@ put_rpm: return ret; } -static int -i915_gem_check_wedge(unsigned reset_counter, bool interruptible) -{ - if (__i915_terminally_wedged(reset_counter)) - return -EIO; - - if (__i915_reset_in_progress(reset_counter)) { - /* Non-interruptible callers can't handle -EAGAIN, hence return - * -EIO unconditionally for these. */ - if (!interruptible) - return -EIO; - - return -EAGAIN; - } - - return 0; -} - -static unsigned long local_clock_us(unsigned *cpu) -{ - unsigned long t; - - /* Cheaply and approximately convert from nanoseconds to microseconds. - * The result and subsequent calculations are also defined in the same - * approximate microseconds units. The principal source of timing - * error here is from the simple truncation. - * - * Note that local_clock() is only defined wrt to the current CPU; - * the comparisons are no longer valid if we switch CPUs. Instead of - * blocking preemption for the entire busywait, we can detect the CPU - * switch and use that as indicator of system load and a reason to - * stop busywaiting, see busywait_stop(). - */ - *cpu = get_cpu(); - t = local_clock() >> 10; - put_cpu(); - - return t; -} - -static bool busywait_stop(unsigned long timeout, unsigned cpu) -{ - unsigned this_cpu; - - if (time_after(local_clock_us(&this_cpu), timeout)) - return true; - - return this_cpu != cpu; -} - -static bool __i915_spin_request(struct drm_i915_gem_request *req, - struct intel_wait *wait, - int state) -{ - unsigned long timeout; - unsigned cpu; - - /* When waiting for high frequency requests, e.g. during synchronous - * rendering split between the CPU and GPU, the finite amount of time - * required to set up the irq and wait upon it limits the response - * rate. By busywaiting on the request completion for a short while we - * can service the high frequency waits as quick as possible. However, - * if it is a slow request, we want to sleep as quickly as possible. - * The tradeoff between waiting and sleeping is roughly the time it - * takes to sleep on a request, on the order of a microsecond. - */ - - /* Only spin if we know the GPU is processing this request */ - if (!i915_gem_request_started(req)) - return false; - - timeout = local_clock_us(&cpu) + 5; - do { - if (i915_gem_request_completed(req)) - return true; - - if (signal_pending_state(state, wait->task)) - break; - - if (busywait_stop(timeout, cpu)) - break; - - cpu_relax_lowlatency(); - - /* Break the loop if we have consumed the timeslice (or been - * preempted) or when either the background thread has - * enabled the interrupt, or the IRQ has fired. - */ - } while (!need_resched() && wait->task->state == state); - - return false; -} - -/** - * __i915_wait_request - wait until execution of request has finished - * @req: duh! - * @interruptible: do an interruptible wait (normally yes) - * @timeout: in - how long to wait (NULL forever); out - how much time remaining - * - * Note: It is of utmost importance that the passed in seqno and reset_counter - * values have been read by the caller in an smp safe manner. Where read-side - * locks are involved, it is sufficient to read the reset_counter before - * unlocking the lock that protects the seqno. For lockless tricks, the - * reset_counter _must_ be read before, and an appropriate smp_rmb must be - * inserted. - * - * Returns 0 if the request was found within the alloted time. Else returns the - * errno with remaining time filled in timeout argument. - */ -int __i915_wait_request(struct drm_i915_gem_request *req, - bool interruptible, - s64 *timeout, - struct intel_rps_client *rps) -{ - int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; - struct intel_wait wait; - unsigned long timeout_remain; - int ret = 0; - - might_sleep(); - - if (list_empty(&req->list)) - return 0; - - if (i915_gem_request_completed(req)) - return 0; - - timeout_remain = MAX_SCHEDULE_TIMEOUT; - if (timeout) { - if (WARN_ON(*timeout < 0)) - return -EINVAL; - - if (*timeout == 0) - return -ETIME; - - /* Record current time in case interrupted, or wedged */ - timeout_remain = nsecs_to_jiffies_timeout(*timeout); - *timeout += ktime_get_raw_ns(); - } - - trace_i915_gem_request_wait_begin(req); - - /* This client is about to stall waiting for the GPU. In many cases - * this is undesirable and limits the throughput of the system, as - * many clients cannot continue processing user input/output whilst - * blocked. RPS autotuning may take tens of milliseconds to respond - * to the GPU load and thus incurs additional latency for the client. - * We can circumvent that by promoting the GPU frequency to maximum - * before we wait. This makes the GPU throttle up much more quickly - * (good for benchmarks and user experience, e.g. window animations), - * but at a cost of spending more power processing the workload - * (bad for battery). Not all clients even want their results - * immediately and for them we should just let the GPU select its own - * frequency to maximise efficiency. To prevent a single client from - * forcing the clocks too high for the whole system, we only allow - * each client to waitboost once in a busy period. - */ - if (INTEL_INFO(req->i915)->gen >= 6) - gen6_rps_boost(req->i915, rps, req->emitted_jiffies); - - intel_wait_init(&wait, req->seqno); - set_task_state(wait.task, state); - - /* Optimistic spin for the next ~jiffie before touching IRQs */ - if (intel_engine_add_wait(req->ring, &wait)) { - if (__i915_spin_request(req, &wait, state)) - goto out; - - intel_engine_enable_irq(req->ring); - - /* In order to check that we haven't missed the interrupt - * as we enabled it, we need to kick ourselves to do a - * coherent check on the seqno before we sleep. - */ - goto wakeup; - } - - for (;;) { - if (signal_pending_state(state, wait.task)) { - ret = -ERESTARTSYS; - break; - } - - /* Ensure that even if the GPU hangs, we get woken up. */ - i915_queue_hangcheck(req->i915); - - timeout_remain = io_schedule_timeout(timeout_remain); - if (timeout_remain == 0) { - ret = -ETIME; - break; - } - - if (intel_wait_complete(&wait)) - break; - -wakeup: set_task_state(wait.task, state); - - /* Before we do the heavier coherent read of the seqno, - * check the value (hopefully) in the CPU cacheline. - */ - if (i915_gem_request_completed(req)) - break; - - /* Ensure our read of the seqno is coherent so that we - * do not "miss an interrupt" (i.e. if this is the last - * request and the seqno write from the GPU is not visible - * by the time the interrupt fires, we will see that the - * request is incomplete and go back to sleep awaiting - * another interrupt that will never come.) - * - * Strictly, we only need to do this once after an interrupt, - * but it is easier and safer to do it every time the waiter - * is woken. - */ - if (req->ring->seqno_barrier) { - req->ring->seqno_barrier(req->ring); - if (i915_gem_request_completed(req)) - break; - } - - /* We need to check whether any gpu reset happened in between - * the request being submitted and now. If a reset has occurred, - * the request is effectively complete (we either are in the - * process of or have discarded the rendering and completely - * reset the GPU. The results of the request are lost and we - * are free to continue on with the original operation. - */ - if (req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) - break; - } -out: - intel_engine_remove_wait(req->ring, &wait); - __set_task_state(wait.task, TASK_RUNNING); - trace_i915_gem_request_wait_end(req); - - if (timeout) { - *timeout -= ktime_get_raw_ns(); - if (*timeout < 0) - *timeout = 0; - - /* - * Apparently ktime isn't accurate enough and occasionally has a - * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch - * things up to make the test happy. We allow up to 1 jiffy. - * - * This is a regrssion from the timespec->ktime conversion. - */ - if (ret == -ETIME && *timeout < jiffies_to_usecs(1)*1000) - *timeout = 0; - } - - if (ret == 0 && rps && req->seqno == req->ring->last_submitted_seqno) { - /* The GPU is now idle and this client has stalled. - * Since no other client has submitted a request in the - * meantime, assume that this client is the only one - * supplying work to the GPU but is unable to keep that - * work supplied because it is waiting. Since the GPU is - * then never kept fully busy, RPS autoclocking will - * keep the clocks relatively low, causing further delays. - * Compensate by giving the synchronous client credit for - * a waitboost next time. - */ - spin_lock(&req->i915->rps.client_lock); - list_del_init(&rps->link); - spin_unlock(&req->i915->rps.client_lock); - } - - return ret; -} - -int i915_gem_request_add_to_client(struct drm_i915_gem_request *req, - struct drm_file *file) -{ - struct drm_i915_private *dev_private; - struct drm_i915_file_private *file_priv; - - WARN_ON(!req || !file || req->file_priv); - - if (!req || !file) - return -EINVAL; - - if (req->file_priv) - return -EINVAL; - - dev_private = req->ring->dev->dev_private; - file_priv = file->driver_priv; - - spin_lock(&file_priv->mm.lock); - req->file_priv = file_priv; - list_add_tail(&req->client_list, &file_priv->mm.request_list); - spin_unlock(&file_priv->mm.lock); - - req->pid = get_pid(task_pid(current)); - - return 0; -} - -static inline void -i915_gem_request_remove_from_client(struct drm_i915_gem_request *request) -{ - struct drm_i915_file_private *file_priv = request->file_priv; - - if (!file_priv) - return; - - spin_lock(&file_priv->mm.lock); - list_del(&request->client_list); - request->file_priv = NULL; - spin_unlock(&file_priv->mm.lock); - - put_pid(request->pid); - request->pid = NULL; -} - -static void i915_gem_request_retire(struct drm_i915_gem_request *request) -{ - trace_i915_gem_request_retire(request); - - /* We know the GPU must have read the request to have - * sent us the seqno + interrupt, so use the position - * of tail of the request to update the last known position - * of the GPU head. - * - * Note this requires that we are always called in request - * completion order. - */ - request->ringbuf->last_retired_head = request->postfix; - - list_del_init(&request->list); - i915_gem_request_remove_from_client(request); - - i915_gem_request_unreference(request); -} - -static void -__i915_gem_request_retire__upto(struct drm_i915_gem_request *req) -{ - struct intel_engine_cs *engine = req->ring; - struct drm_i915_gem_request *tmp; - - lockdep_assert_held(&engine->dev->struct_mutex); - - if (list_empty(&req->list)) - return; - - do { - tmp = list_first_entry(&engine->request_list, - typeof(*tmp), list); - - i915_gem_request_retire(tmp); - } while (tmp != req); - - WARN_ON(i915_verify_lists(engine->dev)); -} - -/** - * Waits for a request to be signaled, and cleans up the - * request and object lists appropriately for that event. - */ -int -i915_wait_request(struct drm_i915_gem_request *req) -{ - struct drm_device *dev; - struct drm_i915_private *dev_priv; - bool interruptible; - int ret; - - BUG_ON(req == NULL); - - dev = req->ring->dev; - dev_priv = dev->dev_private; - interruptible = dev_priv->mm.interruptible; - - BUG_ON(!mutex_is_locked(&dev->struct_mutex)); - - ret = __i915_wait_request(req, interruptible, NULL, NULL); - if (ret) - return ret; - - __i915_gem_request_retire__upto(req); - return 0; -} - /** * Ensures that all rendering to the object has completed and the object is * safe to unbind from the GTT or access from the CPU. @@ -1541,7 +1158,7 @@ i915_gem_object_retire_request(struct drm_i915_gem_object *obj, else if (obj->last_write_req == req) i915_gem_object_retire__write(obj); - __i915_gem_request_retire__upto(req); + i915_gem_request_retire__upto(req); } /* A nonblocking variant of the above wait. This is a highly dangerous routine @@ -2696,109 +2313,6 @@ static void i915_set_reset_status(struct drm_i915_private *dev_priv, } } -void i915_gem_request_free(struct kref *req_ref) -{ - struct drm_i915_gem_request *req = container_of(req_ref, - typeof(*req), ref); - struct intel_context *ctx = req->ctx; - - if (req->file_priv) - i915_gem_request_remove_from_client(req); - - if (ctx) { - if (i915.enable_execlists) { - if (ctx != req->ring->default_context) - intel_lr_context_unpin(req); - } - - i915_gem_context_unreference(ctx); - } - - kmem_cache_free(req->i915->requests, req); -} - -int i915_gem_request_alloc(struct intel_engine_cs *ring, - struct intel_context *ctx, - struct drm_i915_gem_request **req_out) -{ - struct drm_i915_private *dev_priv = to_i915(ring->dev); - unsigned reset_counter = i915_reset_counter(&dev_priv->gpu_error); - struct drm_i915_gem_request *req; - int ret; - - if (!req_out) - return -EINVAL; - - *req_out = NULL; - - /* ABI: Before userspace accesses the GPU (e.g. execbuffer), report - * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex - * and restart. - */ - ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible); - if (ret) - return ret; - - req = kmem_cache_zalloc(dev_priv->requests, GFP_KERNEL); - if (req == NULL) - return -ENOMEM; - - ret = i915_gem_get_seqno(ring->dev, &req->seqno); - if (ret) - goto err; - - kref_init(&req->ref); - req->i915 = dev_priv; - req->ring = ring; - req->reset_counter = reset_counter; - req->ctx = ctx; - i915_gem_context_reference(req->ctx); - - if (i915.enable_execlists) - ret = intel_logical_ring_alloc_request_extras(req); - else - ret = intel_ring_alloc_request_extras(req); - if (ret) { - i915_gem_context_unreference(req->ctx); - goto err; - } - - /* - * Reserve space in the ring buffer for all the commands required to - * eventually emit this request. This is to guarantee that the - * i915_add_request() call can't fail. Note that the reserve may need - * to be redone if the request is not actually submitted straight - * away, e.g. because a GPU scheduler has deferred it. - */ - if (i915.enable_execlists) - ret = intel_logical_ring_reserve_space(req); - else - ret = intel_ring_reserve_space(req); - if (ret) { - /* - * At this point, the request is fully allocated even if not - * fully prepared. Thus it can be cleaned up using the proper - * free code. - */ - i915_gem_request_cancel(req); - return ret; - } - - *req_out = req; - return 0; - -err: - kmem_cache_free(dev_priv->requests, req); - return ret; -} - -void i915_gem_request_cancel(struct drm_i915_gem_request *req) -{ - intel_ring_reserved_space_cancel(req->ringbuf); - - i915_gem_request_unreference(req); -} - struct drm_i915_gem_request * i915_gem_find_active_request(struct intel_engine_cs *ring) { @@ -2882,14 +2396,14 @@ static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv, * implicit references on things like e.g. ppgtt address spaces through * the request. */ - while (!list_empty(&ring->request_list)) { + if (!list_empty(&ring->request_list)) { struct drm_i915_gem_request *request; - request = list_first_entry(&ring->request_list, - struct drm_i915_gem_request, - list); + request = list_last_entry(&ring->request_list, + struct drm_i915_gem_request, + list); - i915_gem_request_retire(request); + i915_gem_request_retire__upto(request); } /* Having flushed all requests from all queues, we know that all @@ -2954,7 +2468,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) if (!i915_gem_request_completed(request)) break; - i915_gem_request_retire(request); + i915_gem_request_retire__upto(request); } /* Move any buffers on the active list that are no longer referenced @@ -3077,7 +2591,7 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj) goto retire; if (i915_gem_request_completed(req)) { - __i915_gem_request_retire__upto(req); + i915_gem_request_retire__upto(req); retire: i915_gem_object_retire__read(obj, i); } diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c new file mode 100644 index 000000000000..28435c8d7b1f --- /dev/null +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -0,0 +1,513 @@ +/* + * Copyright © 2008-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "i915_drv.h" + +static int +i915_gem_check_wedge(unsigned reset_counter, bool interruptible) +{ + if (__i915_terminally_wedged(reset_counter)) + return -EIO; + + if (__i915_reset_in_progress(reset_counter)) { + /* Non-interruptible callers can't handle -EAGAIN, hence return + * -EIO unconditionally for these. */ + if (!interruptible) + return -EIO; + + return -EAGAIN; + } + + return 0; +} + +int i915_gem_request_alloc(struct intel_engine_cs *ring, + struct intel_context *ctx, + struct drm_i915_gem_request **req_out) +{ + struct drm_i915_private *dev_priv = to_i915(ring->dev); + unsigned reset_counter = i915_reset_counter(&dev_priv->gpu_error); + struct drm_i915_gem_request *req; + int ret; + + if (!req_out) + return -EINVAL; + + *req_out = NULL; + + /* ABI: Before userspace accesses the GPU (e.g. execbuffer), report + * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex + * and restart. + */ + ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible); + if (ret) + return ret; + + req = kmem_cache_zalloc(dev_priv->requests, GFP_KERNEL); + if (req == NULL) + return -ENOMEM; + + ret = i915_gem_get_seqno(ring->dev, &req->seqno); + if (ret) + goto err; + + kref_init(&req->ref); + req->i915 = dev_priv; + req->ring = ring; + req->reset_counter = reset_counter; + req->ctx = ctx; + i915_gem_context_reference(req->ctx); + + if (i915.enable_execlists) + ret = intel_logical_ring_alloc_request_extras(req); + else + ret = intel_ring_alloc_request_extras(req); + if (ret) { + i915_gem_context_unreference(req->ctx); + goto err; + } + + /* + * Reserve space in the ring buffer for all the commands required to + * eventually emit this request. This is to guarantee that the + * i915_add_request() call can't fail. Note that the reserve may need + * to be redone if the request is not actually submitted straight + * away, e.g. because a GPU scheduler has deferred it. + */ + if (i915.enable_execlists) + ret = intel_logical_ring_reserve_space(req); + else + ret = intel_ring_reserve_space(req); + if (ret) { + /* + * At this point, the request is fully allocated even if not + * fully prepared. Thus it can be cleaned up using the proper + * free code. + */ + i915_gem_request_cancel(req); + return ret; + } + + *req_out = req; + return 0; + +err: + kmem_cache_free(dev_priv->requests, req); + return ret; +} + +void i915_gem_request_cancel(struct drm_i915_gem_request *req) +{ + intel_ring_reserved_space_cancel(req->ringbuf); + + i915_gem_request_unreference(req); +} + +int i915_gem_request_add_to_client(struct drm_i915_gem_request *req, + struct drm_file *file) +{ + struct drm_i915_private *dev_private; + struct drm_i915_file_private *file_priv; + + WARN_ON(!req || !file || req->file_priv); + + if (!req || !file) + return -EINVAL; + + if (req->file_priv) + return -EINVAL; + + dev_private = req->ring->dev->dev_private; + file_priv = file->driver_priv; + + spin_lock(&file_priv->mm.lock); + req->file_priv = file_priv; + list_add_tail(&req->client_list, &file_priv->mm.request_list); + spin_unlock(&file_priv->mm.lock); + + req->pid = get_pid(task_pid(current)); + + return 0; +} + +static inline void +i915_gem_request_remove_from_client(struct drm_i915_gem_request *request) +{ + struct drm_i915_file_private *file_priv = request->file_priv; + + if (!file_priv) + return; + + spin_lock(&file_priv->mm.lock); + list_del(&request->client_list); + request->file_priv = NULL; + spin_unlock(&file_priv->mm.lock); + + put_pid(request->pid); + request->pid = NULL; +} + +static void i915_gem_request_retire(struct drm_i915_gem_request *request) +{ + trace_i915_gem_request_retire(request); + + /* We know the GPU must have read the request to have + * sent us the seqno + interrupt, so use the position + * of tail of the request to update the last known position + * of the GPU head. + * + * Note this requires that we are always called in request + * completion order. + */ + request->ringbuf->last_retired_head = request->postfix; + + list_del_init(&request->list); + i915_gem_request_remove_from_client(request); + + i915_gem_request_unreference(request); +} + +void +i915_gem_request_retire__upto(struct drm_i915_gem_request *req) +{ + struct intel_engine_cs *engine = req->ring; + struct drm_i915_gem_request *tmp; + + lockdep_assert_held(&engine->dev->struct_mutex); + + if (list_empty(&req->list)) + return; + + do { + tmp = list_first_entry(&engine->request_list, + typeof(*tmp), list); + + i915_gem_request_retire(tmp); + } while (tmp != req); + + WARN_ON(i915_verify_lists(engine->dev)); +} + +static unsigned long local_clock_us(unsigned *cpu) +{ + unsigned long t; + + /* Cheaply and approximately convert from nanoseconds to microseconds. + * The result and subsequent calculations are also defined in the same + * approximate microseconds units. The principal source of timing + * error here is from the simple truncation. + * + * Note that local_clock() is only defined wrt to the current CPU; + * the comparisons are no longer valid if we switch CPUs. Instead of + * blocking preemption for the entire busywait, we can detect the CPU + * switch and use that as indicator of system load and a reason to + * stop busywaiting, see busywait_stop(). + */ + *cpu = get_cpu(); + t = local_clock() >> 10; + put_cpu(); + + return t; +} + +static bool busywait_stop(unsigned long timeout, unsigned cpu) +{ + unsigned this_cpu; + + if (time_after(local_clock_us(&this_cpu), timeout)) + return true; + + return this_cpu != cpu; +} + +static bool __i915_spin_request(struct drm_i915_gem_request *req, + struct intel_wait *wait, + int state) +{ + unsigned long timeout; + unsigned cpu; + + /* When waiting for high frequency requests, e.g. during synchronous + * rendering split between the CPU and GPU, the finite amount of time + * required to set up the irq and wait upon it limits the response + * rate. By busywaiting on the request completion for a short while we + * can service the high frequency waits as quick as possible. However, + * if it is a slow request, we want to sleep as quickly as possible. + * The tradeoff between waiting and sleeping is roughly the time it + * takes to sleep on a request, on the order of a microsecond. + */ + + /* Only spin if we know the GPU is processing this request */ + if (!i915_gem_request_started(req)) + return false; + + timeout = local_clock_us(&cpu) + 5; + do { + if (i915_gem_request_completed(req)) + return true; + + if (signal_pending_state(state, wait->task)) + break; + + if (busywait_stop(timeout, cpu)) + break; + + cpu_relax_lowlatency(); + + /* Break the loop if we have consumed the timeslice (or been + * preempted) or when either the background thread has + * enabled the interrupt, or the IRQ has fired. + */ + } while (!need_resched() && wait->task->state == state); + + return false; +} + +/** + * __i915_wait_request - wait until execution of request has finished + * @req: duh! + * @interruptible: do an interruptible wait (normally yes) + * @timeout: in - how long to wait (NULL forever); out - how much time remaining + * + * Note: It is of utmost importance that the passed in seqno and reset_counter + * values have been read by the caller in an smp safe manner. Where read-side + * locks are involved, it is sufficient to read the reset_counter before + * unlocking the lock that protects the seqno. For lockless tricks, the + * reset_counter _must_ be read before, and an appropriate smp_rmb must be + * inserted. + * + * Returns 0 if the request was found within the alloted time. Else returns the + * errno with remaining time filled in timeout argument. + */ +int __i915_wait_request(struct drm_i915_gem_request *req, + bool interruptible, + s64 *timeout, + struct intel_rps_client *rps) +{ + int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; + struct intel_wait wait; + unsigned long timeout_remain; + int ret = 0; + + might_sleep(); + + if (list_empty(&req->list)) + return 0; + + if (i915_gem_request_completed(req)) + return 0; + + timeout_remain = MAX_SCHEDULE_TIMEOUT; + if (timeout) { + if (WARN_ON(*timeout < 0)) + return -EINVAL; + + if (*timeout == 0) + return -ETIME; + + /* Record current time in case interrupted, or wedged */ + timeout_remain = nsecs_to_jiffies_timeout(*timeout); + *timeout += ktime_get_raw_ns(); + } + + trace_i915_gem_request_wait_begin(req); + + /* This client is about to stall waiting for the GPU. In many cases + * this is undesirable and limits the throughput of the system, as + * many clients cannot continue processing user input/output whilst + * blocked. RPS autotuning may take tens of milliseconds to respond + * to the GPU load and thus incurs additional latency for the client. + * We can circumvent that by promoting the GPU frequency to maximum + * before we wait. This makes the GPU throttle up much more quickly + * (good for benchmarks and user experience, e.g. window animations), + * but at a cost of spending more power processing the workload + * (bad for battery). Not all clients even want their results + * immediately and for them we should just let the GPU select its own + * frequency to maximise efficiency. To prevent a single client from + * forcing the clocks too high for the whole system, we only allow + * each client to waitboost once in a busy period. + */ + if (INTEL_INFO(req->i915)->gen >= 6) + gen6_rps_boost(req->i915, rps, req->emitted_jiffies); + + intel_wait_init(&wait, req->seqno); + set_task_state(wait.task, state); + + /* Optimistic spin for the next ~jiffie before touching IRQs */ + if (intel_engine_add_wait(req->ring, &wait)) { + if (__i915_spin_request(req, &wait, state)) + goto out; + + intel_engine_enable_irq(req->ring); + + /* In order to check that we haven't missed the interrupt + * as we enabled it, we need to kick ourselves to do a + * coherent check on the seqno before we sleep. + */ + goto wakeup; + } + + for (;;) { + if (signal_pending_state(state, wait.task)) { + ret = -ERESTARTSYS; + break; + } + + /* Ensure that even if the GPU hangs, we get woken up. */ + i915_queue_hangcheck(req->i915); + + timeout_remain = io_schedule_timeout(timeout_remain); + if (timeout_remain == 0) { + ret = -ETIME; + break; + } + + if (intel_wait_complete(&wait)) + break; + +wakeup: set_task_state(wait.task, state); + + /* Before we do the heavier coherent read of the seqno, + * check the value (hopefully) in the CPU cacheline. + */ + if (i915_gem_request_completed(req)) + break; + + /* Ensure our read of the seqno is coherent so that we + * do not "miss an interrupt" (i.e. if this is the last + * request and the seqno write from the GPU is not visible + * by the time the interrupt fires, we will see that the + * request is incomplete and go back to sleep awaiting + * another interrupt that will never come.) + * + * Strictly, we only need to do this once after an interrupt, + * but it is easier and safer to do it every time the waiter + * is woken. + */ + if (req->ring->seqno_barrier) { + req->ring->seqno_barrier(req->ring); + if (i915_gem_request_completed(req)) + break; + } + + /* We need to check whether any gpu reset happened in between + * the request being submitted and now. If a reset has occurred, + * the request is effectively complete (we either are in the + * process of or have discarded the rendering and completely + * reset the GPU. The results of the request are lost and we + * are free to continue on with the original operation. + */ + if (req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) + break; + } +out: + intel_engine_remove_wait(req->ring, &wait); + __set_task_state(wait.task, TASK_RUNNING); + trace_i915_gem_request_wait_end(req); + + if (timeout) { + *timeout -= ktime_get_raw_ns(); + if (*timeout < 0) + *timeout = 0; + + /* + * Apparently ktime isn't accurate enough and occasionally has a + * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch + * things up to make the test happy. We allow up to 1 jiffy. + * + * This is a regrssion from the timespec->ktime conversion. + */ + if (ret == -ETIME && *timeout < jiffies_to_usecs(1)*1000) + *timeout = 0; + } + + if (ret == 0 && rps && req->seqno == req->ring->last_submitted_seqno) { + /* The GPU is now idle and this client has stalled. + * Since no other client has submitted a request in the + * meantime, assume that this client is the only one + * supplying work to the GPU but is unable to keep that + * work supplied because it is waiting. Since the GPU is + * then never kept fully busy, RPS autoclocking will + * keep the clocks relatively low, causing further delays. + * Compensate by giving the synchronous client credit for + * a waitboost next time. + */ + spin_lock(&req->i915->rps.client_lock); + list_del_init(&rps->link); + spin_unlock(&req->i915->rps.client_lock); + } + + return ret; +} + + +/** + * Waits for a request to be signaled, and cleans up the + * request and object lists appropriately for that event. + */ +int +i915_wait_request(struct drm_i915_gem_request *req) +{ + struct drm_device *dev; + struct drm_i915_private *dev_priv; + bool interruptible; + int ret; + + BUG_ON(req == NULL); + + dev = req->ring->dev; + dev_priv = dev->dev_private; + interruptible = dev_priv->mm.interruptible; + + BUG_ON(!mutex_is_locked(&dev->struct_mutex)); + + ret = __i915_wait_request(req, interruptible, NULL, NULL); + if (ret) + return ret; + + i915_gem_request_retire__upto(req); + return 0; +} + +void i915_gem_request_free(struct kref *req_ref) +{ + struct drm_i915_gem_request *req = container_of(req_ref, + typeof(*req), ref); + struct intel_context *ctx = req->ctx; + + if (req->file_priv) + i915_gem_request_remove_from_client(req); + + if (ctx) { + if (i915.enable_execlists) { + if (ctx != req->ring->default_context) + intel_lr_context_unpin(req); + } + + i915_gem_context_unreference(ctx); + } + + kmem_cache_free(req->i915->requests, req); +} + diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h new file mode 100644 index 000000000000..75c9015c19dc --- /dev/null +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -0,0 +1,209 @@ +/* + * Copyright © 2008-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef I915_GEM_REQUEST_H +#define I915_GEM_REQUEST_H + +/** + * Request queue structure. + * + * The request queue allows us to note sequence numbers that have been emitted + * and may be associated with active buffers to be retired. + * + * By keeping this list, we can avoid having to do questionable sequence + * number comparisons on buffer last_read|write_seqno. It also allows an + * emission time to be associated with the request for tracking how far ahead + * of the GPU the submission is. + * + * The requests are reference counted, so upon creation they should have an + * initial reference taken using kref_init + */ +struct drm_i915_gem_request { + struct kref ref; + + /** On Which ring this request was generated */ + struct drm_i915_private *i915; + struct intel_engine_cs *ring; + unsigned reset_counter; + + /** GEM sequence number associated with the previous request, + * when the HWS breadcrumb is equal to this the GPU is processing + * this request. + */ + u32 previous_seqno; + + /** GEM sequence number associated with this request, + * when the HWS breadcrumb is equal or greater than this the GPU + * has finished processing this request. + */ + u32 seqno; + + /** Position in the ringbuffer of the start of the request */ + u32 head; + + /** + * Position in the ringbuffer of the start of the postfix. + * This is required to calculate the maximum available ringbuffer + * space without overwriting the postfix. + */ + u32 postfix; + + /** Position in the ringbuffer of the end of the whole request */ + u32 tail; + + /** + * Context and ring buffer related to this request + * Contexts are refcounted, so when this request is associated with a + * context, we must increment the context's refcount, to guarantee that + * it persists while any request is linked to it. Requests themselves + * are also refcounted, so the request will only be freed when the last + * reference to it is dismissed, and the code in + * i915_gem_request_free() will then decrement the refcount on the + * context. + */ + struct intel_context *ctx; + struct intel_ringbuffer *ringbuf; + + /** Batch buffer related to this request if any (used for + error state dump only) */ + struct drm_i915_gem_object *batch_obj; + + /** Time at which this request was emitted, in jiffies. */ + unsigned long emitted_jiffies; + + /** global list entry for this request */ + struct list_head list; + + struct drm_i915_file_private *file_priv; + /** file_priv list entry for this request */ + struct list_head client_list; + + /** process identifier submitting this request */ + struct pid *pid; + + /** + * The ELSP only accepts two elements at a time, so we queue + * context/tail pairs on a given queue (ring->execlist_queue) until the + * hardware is available. The queue serves a double purpose: we also use + * it to keep track of the up to 2 contexts currently in the hardware + * (usually one in execution and the other queued up by the GPU): We + * only remove elements from the head of the queue when the hardware + * informs us that an element has been completed. + * + * All accesses to the queue are mediated by a spinlock + * (ring->execlist_lock). + */ + + /** Execlist link in the submission queue.*/ + struct list_head execlist_link; + + /** Execlists no. of times this request has been sent to the ELSP */ + int elsp_submitted; + +}; + +int i915_gem_request_alloc(struct intel_engine_cs *ring, + struct intel_context *ctx, + struct drm_i915_gem_request **req_out); +void i915_gem_request_cancel(struct drm_i915_gem_request *req); +void i915_gem_request_free(struct kref *req_ref); +int i915_gem_request_add_to_client(struct drm_i915_gem_request *req, + struct drm_file *file); +void i915_gem_request_retire__upto(struct drm_i915_gem_request *req); + +static inline uint32_t +i915_gem_request_get_seqno(struct drm_i915_gem_request *req) +{ + return req ? req->seqno : 0; +} + +static inline struct intel_engine_cs * +i915_gem_request_get_ring(struct drm_i915_gem_request *req) +{ + return req ? req->ring : NULL; +} + +static inline struct drm_i915_gem_request * +i915_gem_request_reference(struct drm_i915_gem_request *req) +{ + if (req) + kref_get(&req->ref); + return req; +} + +static inline void +i915_gem_request_unreference(struct drm_i915_gem_request *req) +{ + WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex)); + kref_put(&req->ref, i915_gem_request_free); +} + +static inline void +i915_gem_request_unreference__unlocked(struct drm_i915_gem_request *req) +{ + struct drm_device *dev; + + if (!req) + return; + + dev = req->ring->dev; + if (kref_put_mutex(&req->ref, i915_gem_request_free, &dev->struct_mutex)) + mutex_unlock(&dev->struct_mutex); +} + +static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst, + struct drm_i915_gem_request *src) +{ + if (src) + i915_gem_request_reference(src); + + if (*pdst) + i915_gem_request_unreference(*pdst); + + *pdst = src; +} + +/** + * Returns true if seq1 is later than seq2. + */ +static inline bool +i915_seqno_passed(uint32_t seq1, uint32_t seq2) +{ + return (int32_t)(seq1 - seq2) >= 0; +} + +static inline bool i915_gem_request_started(struct drm_i915_gem_request *req) +{ + return i915_seqno_passed(intel_ring_get_seqno(req->ring), + req->previous_seqno); +} + +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req) +{ + return i915_seqno_passed(intel_ring_get_seqno(req->ring), + req->seqno); +} + + +#endif /* I915_GEM_REQUEST_H */ -- 2.6.3 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx