From: Niranjana Vishwanathapura <niranjana.vishwanathapura@xxxxxxxxx>
Implement the new execbuf3 ioctl (I915_GEM_EXECBUFFER3), which only
works in vm_bind mode. The vm_bind mode, in turn, only accepts
submissions through this new execbuf3 ioctl.
The new execbuf3 ioctl does not take a list of objects to validate
or bind, as all required bindings must have been requested by
userspace before submitting the execbuf3.
Legacy features such as relocations are removed.
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@xxxxxxxxx>
Signed-off-by: Ramalingam C <ramalingam.c@xxxxxxxxx>
Signed-off-by: Andi Shyti <andi.shyti@xxxxxxxxxxxxxxx>
---
drivers/gpu/drm/i915/Makefile | 1 +
.../gpu/drm/i915/gem/i915_gem_execbuffer3.c | 1000 +++++++++++++++++
drivers/gpu/drm/i915/gem/i915_gem_ioctls.h | 2 +
include/uapi/drm/i915_drm.h | 62 +
4 files changed, 1065 insertions(+)
create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c
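
For reviewers, a rough userspace sketch of the intended flow follows; it is
illustrative only and not part of the patch. The batch address field name,
the DRM_IOCTL_* macro name and the helper function are assumptions here; the
authoritative uapi is the struct drm_i915_gem_execbuffer3 definition added to
i915_drm.h by this series.

#include <err.h>
#include <xf86drm.h>    /* drmIoctl() from libdrm */
#include "i915_drm.h"   /* uapi header as extended by this series */

/*
 * Illustrative only: submit a batch whose GPU VA was bound earlier via
 * VM_BIND.  execbuf3 takes addresses, not GEM handles, so no object
 * list is passed.
 */
static void submit_batch_va(int fd, __u32 ctx_id, __u64 batch_va)
{
	struct drm_i915_gem_execbuffer3 exec = {
		.ctx_id = ctx_id,          /* context created in vm_bind mode */
		.engine_idx = 0,           /* index into the context engine map */
		.batch_address = batch_va, /* assumed field name for the batch VA */
		.fence_count = 0,          /* optional drm_i915_gem_timeline_fence array */
		.timeline_fences = 0,
	};

	/* Macro name assumed to follow the usual DRM_IOCTL_* convention. */
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER3, &exec))
		err(1, "execbuf3");
}
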
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 4e1627e96c6e0..38cd1c5bc1a55 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -148,6 +148,7 @@ gem-y += \
gem/i915_gem_dmabuf.o \
gem/i915_gem_domain.o \
gem/i915_gem_execbuffer.o \
+ gem/i915_gem_execbuffer3.o \
gem/i915_gem_internal.o \
gem/i915_gem_object.o \
gem/i915_gem_lmem.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c
new file mode 100644
index 0000000000000..a3d767cd9f808
--- /dev/null
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c
@@ -0,0 +1,1000 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2022 Intel Corporation
+ */
+
+#include <linux/dma-resv.h>
+#include <linux/sync_file.h>
+#include <linux/uaccess.h>
+
+#include <drm/drm_syncobj.h>
+
+#include "gt/intel_context.h"
+#include "gt/intel_gpu_commands.h"
+#include "gt/intel_gt.h"
+#include "gt/intel_gt_pm.h"
+#include "gt/intel_ring.h"
+
+#include "i915_drv.h"
+#include "i915_file_private.h"
+#include "i915_gem_context.h"
+#include "i915_gem_ioctls.h"
+#include "i915_gem_vm_bind.h"
+#include "i915_trace.h"
+
+#define __EXEC3_ENGINE_PINNED BIT_ULL(32)
+#define __EXEC3_INTERNAL_FLAGS (~0ull << 32)
+
+/* Catch emission of unexpected errors for CI! */
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+#undef EINVAL
+#define EINVAL ({ \
+ DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
+ 22; \
+})
+#endif
+
+/**
+ * DOC: User command execution with execbuf3 ioctl
+ *
+ * A VM in VM_BIND mode will not support the older execbuf mode of binding.
+ * The execbuf ioctl handling in VM_BIND mode differs significantly from the
+ * older execbuf2 ioctl (See struct drm_i915_gem_execbuffer2).
+ * Hence, a new execbuf3 ioctl has been added to support VM_BIND mode. (See
+ * struct drm_i915_gem_execbuffer3). The execbuf3 ioctl will not accept any
+ * execlist; hence, there is no support for implicit sync.
+ *
+ * The new execbuf3 ioctl only works in VM_BIND mode, and VM_BIND mode only
+ * accepts submissions through the execbuf3 ioctl.
+ *
+ * The execbuf3 ioctl directly specifies the batch addresses instead of
+ * object handles as in the execbuf2 ioctl. The execbuf3 ioctl also does not
+ * support many of the older features like in/out/submit fences, fence array,
+ * default gem context etc. (See struct drm_i915_gem_execbuffer3).
+ *
+ * In VM_BIND mode, VA allocation is completely managed by userspace instead
+ * of the i915 driver. Hence, VA assignment and eviction are not applicable in
+ * VM_BIND mode. Also, for determining object activeness, VM_BIND mode will not
+ * use the i915_vma active reference tracking. It will instead check the
+ * dma-resv object's fence list for that.
+ *
+ * So, a lot of the code supporting the execbuf2 ioctl, like relocations, VA
+ * evictions, the vma lookup table, implicit sync, vma active reference
+ * tracking etc., is not applicable to the execbuf3 ioctl.
+ */
+
+struct eb_fence {
+ struct drm_syncobj *syncobj;
+ struct dma_fence *dma_fence;
+ u64 value;
+ struct dma_fence_chain *chain_fence;
+};
+
+/**
+ * struct i915_execbuffer - execbuf struct for execbuf3
+ * @i915: reference to the i915 instance we run on
+ * @file: drm file reference
+ * @args: execbuf3 ioctl structure
+ * @gt: reference to the gt instance the ioctl was submitted for
+ * @context: logical state for the request
+ * @gem_context: caller's context
+ * @requests: requests to be built
+ * @composite_fence: used as the excl fence in dma_resv objects when > 1 BB submitted
+ * @ww: i915_gem_ww_ctx instance
+ * @num_batches: number of batches submitted
+ * @batch_addresses: addresses corresponding to the submitted batches
+ * @batches: references to the i915_vmas corresponding to the batches
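+ * @fences: array of fences to wait on and/or signal
+ * @num_fences: number of fences in @fences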
+ */
+struct i915_execbuffer {
+ struct drm_i915_private *i915;
+ struct drm_file *file;
+ struct drm_i915_gem_execbuffer3 *args;
+
+ struct intel_gt *gt;
+ struct intel_context *context;
+ struct i915_gem_context *gem_context;
+
+ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1];
+ struct dma_fence *composite_fence;
+
+ struct i915_gem_ww_ctx ww;
+
+ unsigned int num_batches;
+ u64 batch_addresses[MAX_ENGINE_INSTANCE + 1];
+ struct i915_vma *batches[MAX_ENGINE_INSTANCE + 1];
+
+ struct eb_fence *fences;
+ unsigned long num_fences;
+};
+
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
+
+static int eb_select_context(struct i915_execbuffer *eb)
+{
+ struct i915_gem_context *ctx;
+
+ ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->ctx_id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ eb->gem_context = ctx;
+ return 0;
+}
+
+static struct i915_vma *
+eb_find_vma(struct i915_address_space *vm, u64 addr)
+{
+ u64 va;
+
+ lockdep_assert_held(&vm->vm_bind_lock);
+
+ va = gen8_noncanonical_addr(addr & PIN_OFFSET_MASK);
+ return i915_gem_vm_bind_lookup_vma(vm, va);
+}
+
+static int eb_lookup_vma_all(struct i915_execbuffer *eb)
+{
+ unsigned int i, current_batch = 0;
+ struct i915_vma *vma;
+
+ for (i = 0; i < eb->num_batches; i++) {
+ vma = eb_find_vma(eb->context->vm, eb->batch_addresses[i]);
+ if (!vma)
+ return -EINVAL;
+
+ eb->batches[current_batch] = vma;
+ ++current_batch;
+ }
+
+ return 0;
+}
+
+static void eb_release_vma_all(struct i915_execbuffer *eb, bool final)
+{
+}
+
+static int eb_validate_vma_all(struct i915_execbuffer *eb)
+{
+ /* only throttle once, even if we didn't need to throttle */
+ for (bool throttle = true;; throttle = false) {
+ int err;
+
+ err = eb_pin_engine(eb, throttle);
+ if (!err)
+ return 0;
+
+ if (err != -EDEADLK)
+ return err;
+
+ err = i915_gem_ww_ctx_backoff(&eb->ww);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Two helper loops are used to control the order in which requests / batches
+ * are created and added to the backend. Requests are created in order from the
+ * parent to the last child. Requests are added in the reverse order, from the
+ * last child to the parent. This is done for locking reasons, as the timeline
+ * lock is acquired during request creation and released when the request is
+ * added to the backend. To make lockdep happy (see intel_context_timeline_lock)
+ * this must be the ordering.
+ */
+#define for_each_batch_create_order(_eb, _i) \
+ for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i))
+#define for_each_batch_add_order(_eb, _i) \
+ BUILD_BUG_ON(!typecheck(int, _i)); \
+ for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i))
+
+static int eb_move_to_gpu(struct i915_execbuffer *eb)
+{
+ /* Unconditionally flush any chipset caches (for streaming writes). */
+ intel_gt_chipset_flush(eb->gt);
+
+ return 0;
+}
+
+static int eb_request_submit(struct i915_execbuffer *eb,
+ struct i915_request *rq,
+ struct i915_vma *batch,
+ u64 batch_len)
+{
+ struct intel_engine_cs *engine = rq->context->engine;
+ int err;
+
+ if (intel_context_nopreempt(rq->context))
+ __set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
+
+ /*
+ * Once we have completed waiting for other engines (using HW semaphores),
+ * we can signal that this request/batch is ready to run. This allows us
+ * to determine if the batch is still waiting on the GPU or actually
+ * running by checking the breadcrumb.
+ */
+ if (engine->emit_init_breadcrumb) {
+ err = engine->emit_init_breadcrumb(rq);
+ if (err)
+ return err;
+ }
+
+ return engine->emit_bb_start(rq, batch->node.start, batch_len, 0);
+}
+
+static int eb_submit(struct i915_execbuffer *eb)
+{
+ unsigned int i;
+ int err;
+
+ err = eb_move_to_gpu(eb);
+
+ for_each_batch_create_order(eb, i) {
+ if (!eb->requests[i])
+ break;
+
+ trace_i915_request_queue(eb->requests[i], 0);
+ if (!err)
+ err = eb_request_submit(eb, eb->requests[i],
+ eb->batches[i],
+ eb->batches[i]->size);
+ }
+
+ return err;
+}
+
+static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel_context *ce)
+{
+ struct intel_ring *ring = ce->ring;
+ struct intel_timeline *tl = ce->timeline;
+ struct i915_request *rq;
+
+ /*
+ * Completely unscientific finger-in-the-air estimates for suitable
+ * maximum user request size (to avoid blocking) and then backoff.
+ */
+ if (intel_ring_update_space(ring) >= PAGE_SIZE)
+ return NULL;
+
+ /*
+ * Find a request such that, after waiting upon it, there will be at least
+ * half the ring available. The hysteresis allows us to compete for the
+ * shared ring and should mean that we sleep less often prior to
+ * claiming our resources, but not so long that the ring completely
+ * drains before we can submit our next request.
+ */
+ list_for_each_entry(rq, &tl->requests, link) {
+ if (rq->ring != ring)
+ continue;
+
+ if (__intel_ring_space(rq->postfix,
+ ring->emit, ring->size) > ring->size / 2)
+ break;
+ }
+ if (&rq->link == &tl->requests)
+ return NULL; /* weird, we will check again later for real */
+
+ return i915_request_get(rq);
+}
+
+static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce,
+ bool throttle)
+{
+ struct intel_timeline *tl;
+ struct i915_request *rq = NULL;
+
+ /*
+ * Take a local wakeref for preparing to dispatch the execbuf as
+ * we expect to access the hardware fairly frequently in the
+ * process, and require the engine to be kept awake between accesses.
+ * Upon dispatch, we acquire another prolonged wakeref that we hold
+ * until the timeline is idle, which in turn releases the wakeref
+ * taken on the engine, and the parent device.
+ */
+ tl = intel_context_timeline_lock(ce);
+ if (IS_ERR(tl))
+ return PTR_ERR(tl);
+
+ intel_context_enter(ce);
+ if (throttle)
+ rq = eb_throttle(eb, ce);
+ intel_context_timeline_unlock(tl);
+
+ if (rq) {
+ bool nonblock = eb->file->filp->f_flags & O_NONBLOCK;
+ long timeout = nonblock ? 0 : MAX_SCHEDULE_TIMEOUT;
+
+ if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
+ timeout) < 0) {
+ i915_request_put(rq);
+
+ /*
+ * Error path, cannot use intel_context_timeline_lock as
+ * that is user interruptible and this cleanup step
+ * must be done.
+ */
+ mutex_lock(&ce->timeline->mutex);
+ intel_context_exit(ce);
+ mutex_unlock(&ce->timeline->mutex);
+
+ if (nonblock)
+ return -EWOULDBLOCK;
+ else
+ return -EINTR;
+ }
+ i915_request_put(rq);
+ }
+
+ return 0;
+}
+
+static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
+{
+ struct intel_context *ce = eb->context, *child;
+ int err;
+ int i = 0, j = 0;
+
+ GEM_BUG_ON(eb->args->flags & __EXEC3_ENGINE_PINNED);
+
+ if (unlikely(intel_context_is_banned(ce)))
+ return -EIO;
+
+ /*
+ * Pinning the contexts may generate requests in order to acquire
+ * GGTT space, so do this first before we reserve a seqno for
+ * ourselves.
+ */
+ err = intel_context_pin_ww(ce, &eb->ww);
+ if (err)
+ return err;
+
+ for_each_child(ce, child) {
+ err = intel_context_pin_ww(child, &eb->ww);
+ GEM_BUG_ON(err); /* perma-pinned should incr a counter */
+ }
+
+ for_each_child(ce, child) {
+ err = eb_pin_timeline(eb, child, throttle);
+ if (err)
+ goto unwind;
+ ++i;
+ }
+ err = eb_pin_timeline(eb, ce, throttle);
+ if (err)
+ goto unwind;
+
+ eb->args->flags |= __EXEC3_ENGINE_PINNED;
+ return 0;
+
+unwind:
+ for_each_child(ce, child) {
+ if (j++ < i) {
+ mutex_lock(&child->timeline->mutex);
+ intel_context_exit(child);
+ mutex_unlock(&child->timeline->mutex);
+ }
+ }
+ for_each_child(ce, child)
+ intel_context_unpin(child);
+ intel_context_unpin(ce);
+ return err;
+}
+
+static int
+eb_select_engine(struct i915_execbuffer *eb)
+{
+ struct intel_context *ce, *child;
+ unsigned int idx;
+ int err;
+
+ if (!i915_gem_context_user_engines(eb->gem_context))
+ return -EINVAL;
+
+ idx = eb->args->engine_idx;
+ ce = i915_gem_context_get_engine(eb->gem_context, idx);
+ if (IS_ERR(ce))
+ return PTR_ERR(ce);
+
+ eb->num_batches = ce->parallel.number_children + 1;
+
+ for_each_child(ce, child)
+ intel_context_get(child);
+ intel_gt_pm_get(ce->engine->gt);
+
+ if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {
+ err = intel_context_alloc_state(ce);
+ if (err)
+ goto err;
+ }
+ for_each_child(ce, child) {
+ if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) {
+ err = intel_context_alloc_state(child);
+ if (err)
+ goto err;
+ }
+ }
+
+ /*
+ * ABI: Before userspace accesses the GPU (e.g. execbuffer), report
+ * EIO if the GPU is already wedged.
+ */
+ err = intel_gt_terminally_wedged(ce->engine->gt);
+ if (err)
+ goto err;
+
+ if (!i915_vm_tryget(ce->vm)) {
+ err = -ENOENT;
+ goto err;
+ }
+
+ eb->context = ce;
+ eb->gt = ce->engine->gt;
+
+ /*
+ * Make sure engine pool stays alive even if we call intel_context_put
+ * during ww handling. The pool is destroyed when last pm reference
+ * is dropped, which breaks our -EDEADLK handling.
+ */
+ return err;
+
+err:
+ intel_gt_pm_put(ce->engine->gt);
+ for_each_child(ce, child)
+ intel_context_put(child);
+ intel_context_put(ce);
+ return err;
+}
+
+static void
+eb_put_engine(struct i915_execbuffer *eb)
+{
+ struct intel_context *child;
+
+ i915_vm_put(eb->context->vm);
+ intel_gt_pm_put(eb->gt);
+ for_each_child(eb->context, child)
+ intel_context_put(child);
+ intel_context_put(eb->context);
+}
+
+static void
+__free_fence_array(struct eb_fence *fences, unsigned int n)
+{
+ while (n--) {
+ drm_syncobj_put(ptr_mask_bits(fences[n].syncobj, 2));
+ dma_fence_put(fences[n].dma_fence);
+ dma_fence_chain_free(fences[n].chain_fence);
+ }
+ kvfree(fences);
+}
+
+static int add_timeline_fence_array(struct i915_execbuffer *eb)
+{
+ struct drm_i915_gem_timeline_fence __user *user_fences;
+ struct eb_fence *f;
+ u64 nfences;
+ int err = 0;
+
+ nfences = eb->args->fence_count;
+ if (!nfences)
+ return 0;
+
+ /* Check multiplication overflow for access_ok() and kvmalloc_array() */
+ BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long));
+ if (nfences > min_t(unsigned long,
+ ULONG_MAX / sizeof(*user_fences),
+ SIZE_MAX / sizeof(*f)) - eb->num_fences)
+ return -EINVAL;
+
+ user_fences = u64_to_user_ptr(eb->args->timeline_fences);
+ if (!access_ok(user_fences, nfences * sizeof(*user_fences)))
+ return -EFAULT;
+
+ f = krealloc(eb->fences,
+ (eb->num_fences + nfences) * sizeof(*f),
+ __GFP_NOWARN | GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ eb->fences = f;
+ f += eb->num_fences;
+
+ BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) &
+ ~__I915_TIMELINE_FENCE_UNKNOWN_FLAGS);
+
+ while (nfences--) {
+ struct drm_i915_gem_timeline_fence user_fence;
+ struct drm_syncobj *syncobj;
+ struct dma_fence *fence = NULL;
+ u64 point;
+
+ if (__copy_from_user(&user_fence,
+ user_fences++,
+ sizeof(user_fence)))
+ return -EFAULT;
+
+ if (user_fence.flags & __I915_TIMELINE_FENCE_UNKNOWN_FLAGS)
+ return -EINVAL;
+
+ syncobj = drm_syncobj_find(eb->file, user_fence.handle);
+ if (!syncobj) {
+ DRM_DEBUG("Invalid syncobj handle provided\n");
+ return -ENOENT;
+ }
+
+ fence = drm_syncobj_fence_get(syncobj);
+
+ if (!fence && user_fence.flags &&
+ !(user_fence.flags & I915_TIMELINE_FENCE_SIGNAL)) {
+ DRM_DEBUG("Syncobj handle has no fence\n");
+ drm_syncobj_put(syncobj);
+ return -EINVAL;
+ }
+
+ point = user_fence.value;
+ if (fence)
+ err = dma_fence_chain_find_seqno(&fence, point);
+
+ if (err && !(user_fence.flags & I915_TIMELINE_FENCE_SIGNAL)) {
+ DRM_DEBUG("Syncobj handle missing requested point %llu\n", point);
+ dma_fence_put(fence);
+ drm_syncobj_put(syncobj);
+ return err;
+ }
+
+ /*
+ * A point might have been signaled already and
+ * garbage collected from the timeline. In this case
+ * just ignore the point and carry on.
+ */
+ if (!fence && !(user_fence.flags & I915_TIMELINE_FENCE_SIGNAL)) {
+ drm_syncobj_put(syncobj);
+ continue;
+ }
+
+ /*
+ * For timeline syncobjs we need to preallocate chains for
+ * later signaling.
+ */
+ if (point != 0 && user_fence.flags & I915_TIMELINE_FENCE_SIGNAL) {
+ /*
+ * Waiting and signaling the same point (when point !=
+ * 0) would break the timeline.
+ */
+ if (user_fence.flags & I915_TIMELINE_FENCE_WAIT) {
+ DRM_DEBUG("Trying to wait & signal the same timeline point.\n");
+ dma_fence_put(fence);
+ drm_syncobj_put(syncobj);
+ return -EINVAL;
+ }
+
+ f->chain_fence = dma_fence_chain_alloc();
+ if (!f->chain_fence) {
+ drm_syncobj_put(syncobj);
+ dma_fence_put(fence);
+ return -ENOMEM;
+ }
+ } else {
+ f->chain_fence = NULL;
+ }
+
+ f->syncobj = ptr_pack_bits(syncobj, user_fence.flags, 2);
+ f->dma_fence = fence;
+ f->value = point;
+ f++;
+ eb->num_fences++;
+ }
+
+ return 0;
+}
+
+static void put_fence_array(struct eb_fence *fences, int num_fences)
+{
+ if (fences)
+ __free_fence_array(fences, num_fences);
+}
+
+static int
+await_fence_array(struct i915_execbuffer *eb,
+ struct i915_request *rq)
+{
+ unsigned int n;
+
+ for (n = 0; n < eb->num_fences; n++) {
+ int err;
+
+ struct drm_syncobj *syncobj;
+ unsigned int flags;
+
+ syncobj = ptr_unpack_bits(eb->fences[n].syncobj, &flags, 2);
+
+ if (!eb->fences[n].dma_fence)
+ continue;
+
+ err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static void signal_fence_array(const struct i915_execbuffer *eb,
+ struct dma_fence * const fence)
+{
+ unsigned int n;
+
+ for (n = 0; n < eb->num_fences; n++) {
+ struct drm_syncobj *syncobj;
+ unsigned int flags;
+
+ syncobj = ptr_unpack_bits(eb->fences[n].syncobj, &flags, 2);
+ if (!(flags & I915_TIMELINE_FENCE_SIGNAL))
+ continue;
+
+ if (eb->fences[n].chain_fence) {
+ drm_syncobj_add_point(syncobj,
+ eb->fences[n].chain_fence,
+ fence,
+ eb->fences[n].value);
+ /*
+ * The chain's ownership is transferred to the
+ * timeline.
+ */
+ eb->fences[n].chain_fence = NULL;
+ } else {
+ drm_syncobj_replace_fence(syncobj, fence);
+ }
+ }
+}