This doesn't work yet, but it shows the direction I'm going. Instead of relying on struct_mutex, we rely on the object locks to parallelize submission. We still have some issues because eb_lookup_vmas() pins the vma, so parallel submission doesn't completely work yet. I'm hoping to find a good way to handle this. Ideally we would drop the pinning and rely only on the object lock, but unfortunately the driver doesn't support that yet. Posting this as an RFC to get some feedback on the direction we should be going. A rough sketch of the resulting lock-and-backoff loop is appended after the patch. Signed-off-by: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxxxx> --- .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 295 ++++++++++-------- drivers/gpu/drm/i915/i915_cmd_parser.c | 15 +- drivers/gpu/drm/i915/i915_drv.h | 6 - 3 files changed, 171 insertions(+), 145 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 02fdf081c1be..ee753684f0ba 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -234,6 +234,8 @@ struct i915_execbuffer { struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ struct eb_vma *vma; + struct i915_gem_ww_ctx ww; + struct intel_engine_cs *engine; /** engine to queue the request to */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ @@ -286,6 +288,9 @@ struct i915_execbuffer { struct hlist_head *buckets; /** ht for relocation handles */ }; +static int eb_parse(struct i915_execbuffer *eb, + struct intel_engine_pool_node *pool); + /* * Used to convert any address to canonical form. * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS, @@ -504,8 +509,10 @@ eb_add_vma(struct i915_execbuffer *eb, if (!(eb->args->flags & __EXEC_VALIDATED)) { err = eb_validate_vma(eb, entry, vma); - if (unlikely(err)) + if (unlikely(err)) { + DRM_DEBUG_DRIVER("eb_validate_vma => %i\n", err); return err; + } } ev->vma = vma; @@ -551,8 +558,10 @@ eb_add_vma(struct i915_execbuffer *eb, eb_unreserve_vma(ev); list_add_tail(&ev->bind_link, &eb->unbound); - if (drm_mm_node_allocated(&vma->node)) + if (drm_mm_node_allocated(&vma->node)) { err = i915_vma_unbind(vma); + WARN(err, "i915_vma_unbind => %i\n", err); + } } return err; } @@ -574,7 +583,7 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache, obj->cache_level != I915_CACHE_NONE); } -static int eb_reserve_vma(const struct i915_execbuffer *eb, struct eb_vma *ev) +static int eb_reserve_vma(struct i915_execbuffer *eb, struct eb_vma *ev) { struct drm_i915_gem_exec_object2 *entry = ev->exec; unsigned int exec_flags = ev->flags; @@ -582,6 +591,10 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb, struct eb_vma *ev) u64 pin_flags; int err; + err = i915_gem_object_lock(&eb->ww, vma->obj); + if (err) + return err; + pin_flags = PIN_USER | PIN_NONBLOCK; if (exec_flags & EXEC_OBJECT_NEEDS_GTT) pin_flags |= PIN_GLOBAL; @@ -797,9 +810,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) lut->handle = handle; lut->ctx = eb->gem_context; - i915_gem_object_lock(NULL, obj); list_add(&lut->obj_link, &obj->lut_list); - i915_gem_object_unlock(obj); add_vma: err = eb_add_vma(eb, i, batch, vma); @@ -812,15 +823,33 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) mutex_unlock(&eb->gem_context->mutex); + if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { + DRM_DEBUG("Attempting to use self-modifying batch buffer\n"); + return -EINVAL; + } + + if (range_overflows_t(u64,
eb->batch_start_offset, eb->batch_len, + eb->batch->vma->size)) { + DRM_DEBUG("Attempting to use out-of-bounds batch\n"); + return -EINVAL; + } + + if (eb->batch_len == 0) + eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; + eb->args->flags |= __EXEC_VALIDATED; return eb_reserve(eb); err_obj: i915_gem_object_put(obj); + DRM_DEBUG_DRIVER("err_obj() => %i\n", err); err_vma: eb->vma[i].vma = NULL; + DRM_DEBUG_DRIVER("err_vma() => %i\n", err); err_ctx: mutex_unlock(&eb->gem_context->mutex); + DRM_DEBUG_DRIVER("err_ctx() => %i\n", err); return err; } @@ -866,6 +895,15 @@ static void eb_release_vmas(const struct i915_execbuffer *eb) } } +static void eb_unreserve_vmas(struct i915_execbuffer *eb) +{ + const unsigned int count = eb->buffer_count; + unsigned int i; + + for (i = 0; i < count; i++) + eb_unreserve_vma(&eb->vma[i]); +} + static void eb_reset_vmas(const struct i915_execbuffer *eb) { eb_release_vmas(eb); @@ -1121,7 +1159,7 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) struct drm_i915_gem_object *obj = vma->obj; int err; - i915_vma_lock(vma); + assert_vma_held(vma); if (obj->cache_dirty & ~obj->cache_coherent) i915_gem_clflush_object(obj, 0); @@ -1131,8 +1169,6 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); - return err; } @@ -1190,11 +1226,10 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb, if (err) goto skip_request; - i915_vma_lock(batch); + assert_vma_held(batch); err = i915_request_await_object(rq, batch->obj, false); if (err == 0) err = i915_vma_move_to_active(batch, rq, 0); - i915_vma_unlock(batch); if (err) goto skip_request; @@ -1445,6 +1480,11 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev) struct drm_i915_gem_relocation_entry __user *urelocs; const struct drm_i915_gem_exec_object2 *entry = ev->exec; unsigned int remain; + int err; + + err = i915_gem_object_lock(&eb->ww, ev->vma->obj); + if (err) + return err; urelocs = u64_to_user_ptr(entry->relocs_ptr); remain = entry->relocation_count; @@ -1675,14 +1715,35 @@ static int eb_prefault_relocations(const struct i915_execbuffer *eb) return 0; } +static int eb_lock_and_parse_cmdbuffer(struct i915_execbuffer *eb) +{ + struct intel_engine_pool_node *pool; + int err; + + if (!eb_use_cmdparser(eb)) + return 0; + + pool = intel_engine_get_pool(eb->engine, eb->batch_len); + if (IS_ERR(pool)) + return PTR_ERR(pool); + + err = i915_gem_object_lock(&eb->ww, pool->obj); + if (!err) + err = eb_parse(eb, pool); + + if (err) + intel_engine_pool_put(pool); + + return err; +} + static noinline int eb_relocate_slow(struct i915_execbuffer *eb) { - struct drm_device *dev = &eb->i915->drm; bool have_copy = false; struct eb_vma *ev; int err = 0; -repeat: +repeat_pass: if (signal_pending(current)) { err = -ERESTARTSYS; goto out; @@ -1690,10 +1751,10 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb) /* We may process another execbuffer during the unlock... */ eb_reset_vmas(eb); - mutex_unlock(&dev->struct_mutex); + i915_gem_ww_ctx_fini(&eb->ww); /* - * We take 3 passes through the slowpatch. + * We take 3 passes through the slowpath. * * 1 - we try to just prefault all the user relocation entries and * then attempt to reuse the atomic pagefault disabled fast path again. 
@@ -1714,20 +1775,14 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb) cond_resched(); err = 0; } - if (err) { - mutex_lock(&dev->struct_mutex); + i915_gem_ww_ctx_init(&eb->ww, true); + if (err) goto out; - } /* A frequent cause for EAGAIN are currently unavailable client pages */ flush_workqueue(eb->i915->mm.userptr_wq); - err = i915_mutex_lock_interruptible(dev); - if (err) { - mutex_lock(&dev->struct_mutex); - goto out; - } - +repeat_reloc: /* reacquire the objects */ err = eb_lookup_vmas(eb); if (err) @@ -1740,8 +1795,10 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb) pagefault_disable(); err = eb_relocate_vma(eb, ev); pagefault_enable(); - if (err) - goto repeat; + if (err == -EDEADLK) + goto err; + else if (err) + goto repeat_pass; } else { err = eb_relocate_vma_slow(eb, ev); if (err) @@ -1749,6 +1806,9 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb) } } + if (!err) + err = eb_lock_and_parse_cmdbuffer(eb); + /* * Leave the user relocations as are, this is the painfully slow path, * and we want to avoid the complication of dropping the lock whilst @@ -1757,8 +1817,15 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb) */ err: + if (err == -EDEADLK) { + eb_reset_vmas(eb); + + err = i915_gem_ww_ctx_backoff(&eb->ww); + if (!err) + goto repeat_reloc; + } if (err == -EAGAIN) - goto repeat; + goto repeat_pass; out: if (have_copy) { @@ -1781,60 +1848,79 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb) return err; } -static int eb_relocate(struct i915_execbuffer *eb) +static int eb_relocate_fast(struct i915_execbuffer *eb) { - if (eb_lookup_vmas(eb)) - goto slow; + int err; /* The objects are in their final locations, apply the relocations. */ if (eb->args->flags & __EXEC_HAS_RELOC) { struct eb_vma *ev; list_for_each_entry(ev, &eb->relocs, reloc_link) { - if (eb_relocate_vma(eb, ev)) - goto slow; + err = eb_relocate_vma(eb, ev); + if (err) + return err; } } return 0; - -slow: - return eb_relocate_slow(eb); } -static int eb_move_to_gpu(struct i915_execbuffer *eb) +static int eb_relocate_and_parse_cmdbuf_backoff(struct i915_execbuffer *eb) { - const unsigned int count = eb->buffer_count; - struct ww_acquire_ctx acquire; - unsigned int i; - int err = 0; + int err; - ww_acquire_init(&acquire, &reservation_ww_class); + err = eb_lookup_vmas(eb); + if (err) { + DRM_DEBUG_DRIVER("eb_lookup_vmas() => %i\n", err); + return err; + } - for (i = 0; i < count; i++) { - struct eb_vma *ev = &eb->vma[i]; - struct i915_vma *vma = ev->vma; + DRM_DEBUG_DRIVER("eb_reserve() => %i\n", err); - err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); - if (err == -EDEADLK) { - GEM_BUG_ON(i == 0); - do { - int j = i - 1; +retry_fast: + if (!err) + err = eb_relocate_fast(eb); + if (!err) { + err = eb_lock_and_parse_cmdbuffer(eb); + DRM_DEBUG_DRIVER("Parsing cmdbuf returns %i\n", err); - ww_mutex_unlock(&eb->vma[j].vma->resv->lock); + /* return on success, or any error except -EDEADLK */ + if (err != -EDEADLK) + return err; + } - swap(eb->vma[i], eb->vma[j]); - } while (--i); + if (err == -EDEADLK) { + eb_unreserve_vmas(eb); - err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, - &acquire); - } - if (err == -EALREADY) - err = 0; + err = i915_gem_ww_ctx_backoff(&eb->ww); if (err) - break; + return err; + + goto retry_fast; } - ww_acquire_done(&acquire); + + err = eb_relocate_slow(eb); + if (err) + /* + * If the user expects the execobject.offset and + * reloc.presumed_offset to be an exact match, + * as for using 
NO_RELOC, then we cannot update + * the execobject.offset until we have completed + * relocation. + */ + eb->args->flags &= ~__EXEC_HAS_RELOC; + + DRM_DEBUG_DRIVER("Slow relocation returns %i\n", err); + return err; +} + +static int eb_move_to_gpu(struct i915_execbuffer *eb) +{ + unsigned int i; + int err = 0; + + ww_acquire_done(&eb->ww.ctx); while (i--) { struct eb_vma *ev = &eb->vma[i]; @@ -1880,15 +1966,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) if (err == 0) err = i915_vma_move_to_active(vma, eb->request, flags); - i915_vma_unlock(vma); - __eb_unreserve_vma(vma, flags); if (unlikely(flags & __EXEC_OBJECT_HAS_REF)) i915_vma_put(vma); ev->vma = NULL; } - ww_acquire_fini(&acquire); if (unlikely(err)) goto err_skip; @@ -1990,21 +2073,17 @@ shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj) return vma; } -static struct i915_vma *eb_parse(struct i915_execbuffer *eb) +static int eb_parse(struct i915_execbuffer *eb, + struct intel_engine_pool_node *pool) { - struct intel_engine_pool_node *pool; struct i915_vma *vma; u64 batch_start; u64 shadow_batch_start; int err; - pool = intel_engine_get_pool(eb->engine, eb->batch_len); - if (IS_ERR(pool)) - return ERR_CAST(pool); - vma = shadow_batch_pin(eb, pool->obj); if (IS_ERR(vma)) - goto err; + return PTR_ERR(vma); batch_start = gen8_canonical_addr(eb->batch->vma->node.start) + eb->batch_start_offset; @@ -2028,12 +2107,13 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) * For PPGTT backing however, we have no choice but to forcibly * reject unsafe buffers */ - if (i915_vma_is_ggtt(vma) && err == -EACCES) + if (i915_vma_is_ggtt(vma) && err == -EACCES) { /* Execute original buffer non-secure */ - vma = NULL; - else - vma = ERR_PTR(err); - goto err; + intel_engine_pool_put(pool); + return 0; + } + + return err; } eb->vma[eb->buffer_count].vma = i915_vma_get(vma); @@ -2049,11 +2129,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb) /* eb->batch_len unchanged */ vma->private = pool; - return vma; - -err: - intel_engine_pool_put(pool); - return vma; + return 0; } static void @@ -2485,6 +2561,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.batch_len = args->batch_len; eb.batch_flags = 0; + if (args->flags & I915_EXEC_SECURE) { if (INTEL_GEN(i915) >= 11) return -ENODEV; @@ -2529,68 +2606,36 @@ i915_gem_do_execbuffer(struct drm_device *dev, } err = eb_create(&eb); - if (err) + if (err) { + DRM_DEBUG_DRIVER("eb_create() => %i\n", err); goto err_out_fence; + } GEM_BUG_ON(!eb.lut_size); err = eb_select_context(&eb); - if (unlikely(err)) + if (unlikely(err)) { + DRM_DEBUG_DRIVER("eb_select_context() => %i\n", err); goto err_destroy; + } err = eb_pin_engine(&eb, file, args); - if (unlikely(err)) + if (unlikely(err)) { + DRM_DEBUG_DRIVER("eb_pin_engine() => %i\n", err); goto err_context; - - err = i915_mutex_lock_interruptible(dev); - if (err) - goto err_engine; - - err = eb_relocate(&eb); - if (err) { - /* - * If the user expects the execobject.offset and - * reloc.presumed_offset to be an exact match, - * as for using NO_RELOC, then we cannot update - * the execobject.offset until we have completed - * relocation. 
- */ - args->flags &= ~__EXEC_HAS_RELOC; - goto err_vma; } - if (unlikely(eb.batch->flags & EXEC_OBJECT_WRITE)) { - DRM_DEBUG("Attempting to use self-modifying batch buffer\n"); - err = -EINVAL; - goto err_vma; - } + i915_gem_ww_ctx_init(&eb.ww, true); - batch = eb.batch->vma; - if (range_overflows_t(u64, - eb.batch_start_offset, eb.batch_len, - batch->size)) { - DRM_DEBUG("Attempting to use out-of-bounds batch\n"); - err = -EINVAL; + err = eb_relocate_and_parse_cmdbuf_backoff(&eb); + if (err) goto err_vma; - } - - if (eb.batch_len == 0) - eb.batch_len = eb.batch->vma->size - eb.batch_start_offset; - - if (eb_use_cmdparser(&eb)) { - struct i915_vma *vma; - - vma = eb_parse(&eb); - if (IS_ERR(vma)) { - err = PTR_ERR(vma); - goto err_vma; - } - } /* * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure * batch" bit. Hence we need to pin secure batches into the global gtt. * hsw should have this fixed, but bdw mucks it up again. */ + batch = eb.batch->vma; if (eb.batch_flags & I915_DISPATCH_SECURE) { struct i915_vma *vma; @@ -2607,7 +2652,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, vma = i915_gem_object_ggtt_pin(batch->obj, NULL, 0, 0, 0); if (IS_ERR(vma)) { err = PTR_ERR(vma); - goto err_vma; + goto err_pool; } batch = vma; @@ -2684,13 +2729,13 @@ i915_gem_do_execbuffer(struct drm_device *dev, err_batch_unpin: if (eb.batch_flags & I915_DISPATCH_SECURE) i915_vma_unpin(batch); +err_pool: if (batch->private) intel_engine_pool_put(batch->private); err_vma: if (eb.exec) eb_release_vmas(&eb); - mutex_unlock(&dev->struct_mutex); -err_engine: + i915_gem_ww_ctx_fini(&eb.ww); eb_unpin_engine(&eb); err_context: i915_gem_context_put(eb.gem_context); diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index c5b9ec5b3d25..f5e821bf5d59 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1136,31 +1136,19 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, void *dst, *src; int ret; - ret = i915_gem_object_lock_interruptible(NULL, dst_obj); + ret = i915_gem_object_prepare_write(dst_obj, &dst_needs_clflush); if (ret) return ERR_PTR(ret); - ret = i915_gem_object_prepare_write(dst_obj, &dst_needs_clflush); - if (ret) { - i915_gem_object_unlock(dst_obj); - return ERR_PTR(ret); - } - dst = i915_gem_object_pin_map(dst_obj, I915_MAP_FORCE_WB); i915_gem_object_finish_access(dst_obj); - i915_gem_object_unlock(dst_obj); if (IS_ERR(dst)) return dst; - ret = i915_gem_object_lock_interruptible(NULL, src_obj); - if (ret) - return ERR_PTR(ret); - ret = i915_gem_object_prepare_read(src_obj, &src_needs_clflush); if (ret) { i915_gem_object_unpin_map(dst_obj); - i915_gem_object_unlock(src_obj); return ERR_PTR(ret); } @@ -1209,7 +1197,6 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, } i915_gem_object_finish_access(src_obj); - i915_gem_object_unlock(src_obj); /* dst_obj is returned with vmap pinned */ *needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c3d8af28bfc1..ab8cb0c851f6 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1848,12 +1848,6 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv); -static inline int __must_check -i915_mutex_lock_interruptible(struct drm_device *dev) -{ - return mutex_lock_interruptible(&dev->struct_mutex); -} - int i915_gem_dumb_create(struct drm_file 
*file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args); -- 2.24.0
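For reference, the ww locking pattern this is built around looks roughly like the sketch below. This is an illustration only: the function name is invented, error handling is trimmed, and unlike eb_relocate_and_parse_cmdbuf_backoff() above it redoes the lookup on every retry and omits the relocation slowpath. The helpers are the ones added in the patch.

static int eb_lock_and_relocate(struct i915_execbuffer *eb)
{
	int err;

	/* One acquire context covers every object lock taken for this submission. */
	i915_gem_ww_ctx_init(&eb->ww, true);

retry:
	/* Look up and pin the vmas, taking each object lock through eb->ww. */
	err = eb_lookup_vmas(eb);
	if (!err)
		err = eb_relocate_fast(eb);
	if (!err)
		err = eb_lock_and_parse_cmdbuffer(eb);

	if (err == -EDEADLK) {
		/*
		 * Another submitter holds a lock we need. Drop what we
		 * reserved, wait for the contended lock, and rerun the
		 * whole pass with the same acquire context.
		 */
		eb_unreserve_vmas(eb);
		err = i915_gem_ww_ctx_backoff(&eb->ww);
		if (!err)
			goto retry;
	}

	if (err)
		i915_gem_ww_ctx_fini(&eb->ww);

	/* On success the object locks stay held until the request is submitted. */
	return err;
}

The point is that nothing in this path takes struct_mutex any more, so two clients submitting with disjoint object sets no longer serialise against each other; contention on a shared object is resolved by the ww backoff instead of by serialising all submission.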