Some users require that when a master batch is executed on one particular engine, a companion batch is run simultaneously on a specific slave engine. For this purpose, we introduce virtual engine bonding, allowing maps of master:slaves to be constructed to constrain which physical engines a virtual engine may select given a fence on a master engine. For the moment, we continue to ignore the issue of preemption deferring the master request for later. Ideally, we would like to then also remove the slave and run something else rather than have it stall the pipeline. With load balancing, we should be able to move workload around it, but there is a similar stall on the master pipeline while it may wait for the slave to be executed. At the cost of more latency for the bonded request, it may be interesting to launch both on their engines in lockstep. (Bubbles abound.) Opens: Also what about bonding an engine as its own master? It doesn't break anything internally, so allow the silliness. v2: Emancipate the bonds v3: Couple in delayed scheduling for the selftests v4: Handle invalid mutually exclusive bonding v5: Mention what the uapi does v6: s/nbond/num_bonds/ Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_gem_context.c | 50 +++++ drivers/gpu/drm/i915/i915_request.c | 1 + drivers/gpu/drm/i915/i915_request.h | 3 + drivers/gpu/drm/i915/intel_engine_types.h | 7 + drivers/gpu/drm/i915/intel_lrc.c | 152 ++++++++++++++ drivers/gpu/drm/i915/intel_lrc.h | 4 + drivers/gpu/drm/i915/selftests/intel_lrc.c | 185 ++++++++++++++++++ drivers/gpu/drm/i915/selftests/lib_sw_fence.c | 3 + include/uapi/drm/i915_drm.h | 33 ++++ 9 files changed, 438 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index b387b71e2cb5..8e3b16462956 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1532,8 +1532,58 @@ set_engines__load_balance(struct i915_user_extension __user *base, void *data) return 0; } +static int +set_engines__bond(struct i915_user_extension __user *base, void *data) +{ + struct i915_context_engines_bond __user *ext = + container_of_user(base, typeof(*ext), base); + const struct set_engines *set = data; + unsigned int idx, class, instance; + struct intel_engine_cs *master; + u64 siblings; + int err; + + if (get_user(idx, &ext->virtual_index)) + return -EFAULT; + + if (idx >= set->num_engines) + return -EINVAL; + + idx = array_index_nospec(idx, set->num_engines); + if (!set->engines[idx]) + return -EINVAL; + + /* + * A non-virtual engine has 0 siblings to choose between; and submit + * fence will always be directed to the one engine. + */ + if (!intel_engine_is_virtual(set->engines[idx])) + return 0; + + err = check_user_mbz(&ext->mbz); + if (err) + return err; + + if (get_user(class, &ext->master_class)) + return -EFAULT; + + if (get_user(instance, &ext->master_instance)) + return -EFAULT; + + master = intel_engine_lookup_user(set->ctx->i915, class, instance); + if (!master) + return -EINVAL; + + if (get_user(siblings, &ext->sibling_mask)) + return -EFAULT; + + return intel_virtual_engine_attach_bond(set->engines[idx], + master, siblings); +} + static const i915_user_extension_fn set_engines__extensions[] = { [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance, + [I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond, }; static int diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 0a46f8113f5c..2d209519a6d5 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -743,6 +743,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) rq->batch = NULL; rq->capture_list = NULL; rq->waitboost = false; + INIT_ALL_ENGINES(rq->execution_mask); /* * Reserve space in the ring buffer for all the commands required to diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index d4f6b2940130..5bdab6881b13 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -32,6 +32,8 @@ #include "i915_selftest.h" #include "i915_sw_fence.h" +#include "intel_engine_types.h" + #include <uapi/drm/i915_drm.h> struct drm_file; @@ -145,6 +147,7 @@ struct i915_request { */ struct i915_sched_node sched; struct i915_dependency dep; + intel_engine_mask_t execution_mask; /* * A convenience pointer to the current breadcrumb value stored in diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h index 66ab1deeb7f5..2f148809f63c 100644 --- a/drivers/gpu/drm/i915/intel_engine_types.h +++ b/drivers/gpu/drm/i915/intel_engine_types.h @@ -391,6 +391,13 @@ struct intel_engine_cs { */ void (*submit_request)(struct i915_request *rq); + /* + * Called on signaling of a SUBMIT_FENCE, passing along the signaling + * request down to the bonded pairs. + */ + void (*bond_execute)(struct i915_request *rq, + struct dma_fence *signal); + /* * Call when the priority on a request has changed and it and its * dependencies may need rescheduling. Note the request itself may diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 926ef07f5c38..027c5eff2dce 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -191,6 +191,18 @@ struct virtual_engine { int prio; } nodes[I915_NUM_ENGINES]; + /* + * Keep track of bonded pairs -- restrictions upon on our selection + * of physical engines any particular request may be submitted to. + * If we receive a submit-fence from a master engine, we will only + * use one of sibling_mask physical engines. + */ + struct ve_bond { + struct intel_engine_cs *master; + intel_engine_mask_t sibling_mask; + } *bonds; + unsigned int num_bonds; + /* And finally, which physical engines this virtual engine maps onto. */ unsigned int count; struct intel_engine_cs *siblings[0]; @@ -818,6 +830,12 @@ static void execlists_dequeue(struct intel_engine_cs *engine) continue; } + if (!(rq->execution_mask & engine->mask)) { + /* We peeked too soon! */ + rb = rb_next(rb); + continue; + } + /* * We track when the HW has completed saving the context image * (i.e. when we have seen the final CS event switching out of @@ -912,6 +930,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) rb = rb_first_cached(&execlists->virtual); continue; } + GEM_BUG_ON(!(rq->execution_mask & engine->mask)); if (rq_prio(rq) >= queue_prio(execlists)) { if (last && !can_merge_rq(last, rq)) { @@ -3154,6 +3173,8 @@ static void virtual_context_destroy(struct kref *kref) if (ve->context.state) __execlists_context_fini(&ve->context); + kfree(ve->bonds); + i915_timeline_fini(&ve->base.timeline); kfree(ve); } @@ -3205,9 +3226,30 @@ static const struct intel_context_ops virtual_context_ops = { .destroy = virtual_context_destroy, }; +static unsigned long virtual_submission_mask(struct virtual_engine *ve) +{ + struct i915_request *rq; + unsigned long mask; + + rq = READ_ONCE(ve->request); + if (!rq) + return 0; + + /* The rq is ready for submission; rq->execution_mask is now stable. */ + mask = rq->execution_mask; + if (unlikely(!mask)) { + /* Invalid selection, submit to a random engine in error */ + i915_request_skip(rq, -ENODEV); + mask = ve->siblings[0]->mask; + } + + return mask; +} + static void virtual_submission_tasklet(unsigned long data) { struct virtual_engine * const ve = (struct virtual_engine *)data; + unsigned long mask; unsigned int n; int prio; @@ -3215,6 +3257,12 @@ static void virtual_submission_tasklet(unsigned long data) if (prio == INT_MIN) return; + rcu_read_lock(); + mask = virtual_submission_mask(ve); + rcu_read_unlock(); + if (unlikely(!mask)) + return; + local_irq_disable(); for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) { struct intel_engine_cs *sibling = ve->siblings[n]; @@ -3222,6 +3270,17 @@ static void virtual_submission_tasklet(unsigned long data) struct rb_node **parent, *rb; bool first; + if (unlikely(!(mask & sibling->mask))) { + if (!RB_EMPTY_NODE(&node->rb)) { + spin_lock(&sibling->timeline.lock); + rb_erase_cached(&node->rb, + &sibling->execlists.virtual); + RB_CLEAR_NODE(&node->rb); + spin_unlock(&sibling->timeline.lock); + } + continue; + } + spin_lock(&sibling->timeline.lock); if (!RB_EMPTY_NODE(&node->rb)) { @@ -3284,6 +3343,37 @@ static void virtual_submit_request(struct i915_request *request) tasklet_schedule(&ve->base.execlists.tasklet); } +static struct ve_bond * +virtual_find_bond(struct virtual_engine *ve, struct intel_engine_cs *master) +{ + int i; + + for (i = 0; i < ve->num_bonds; i++) { + if (ve->bonds[i].master == master) + return &ve->bonds[i]; + } + + return NULL; +} + +static void +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + struct ve_bond *bond; + + bond = virtual_find_bond(ve, to_request(signal)->engine); + if (bond) { + intel_engine_mask_t old, new, cmp; + + cmp = READ_ONCE(rq->execution_mask); + do { + old = cmp; + new = cmp & bond->sibling_mask; + } while ((cmp = cmpxchg(&rq->execution_mask, old, new)) != old); + } +} + struct intel_engine_cs * intel_execlists_create_virtual(struct i915_gem_context *ctx, struct intel_engine_cs **siblings, @@ -3322,6 +3412,7 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx, ve->base.schedule = i915_schedule; ve->base.submit_request = virtual_submit_request; + ve->base.bond_execute = virtual_bond_execute; ve->base.execlists.queue_priority_hint = INT_MIN; tasklet_init(&ve->base.execlists.tasklet, @@ -3412,9 +3503,70 @@ intel_execlists_clone_virtual(struct i915_gem_context *ctx, if (IS_ERR(dst)) return dst; + if (se->num_bonds) { + struct virtual_engine *de = to_virtual_engine(dst); + + de->bonds = kmemdup(se->bonds, + sizeof(*se->bonds) * se->num_bonds, + GFP_KERNEL); + if (!de->bonds) { + intel_virtual_engine_destroy(dst); + return ERR_PTR(-ENOMEM); + } + + de->num_bonds = se->num_bonds; + } + return dst; } +static unsigned long +virtual_sibling_mask(struct virtual_engine *ve, unsigned long mask) +{ + unsigned long emask = 0; + int bit; + + for_each_set_bit(bit, &mask, ve->count) + emask |= ve->siblings[bit]->mask; + + return emask; +} + +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + struct intel_engine_cs *master, + unsigned long mask) +{ + struct virtual_engine *ve = to_virtual_engine(engine); + struct ve_bond *bond; + + if (mask >> ve->count) + return -EINVAL; + + mask = virtual_sibling_mask(ve, mask); + if (!mask) + return -EINVAL; + + bond = virtual_find_bond(ve, master); + if (bond) { + bond->sibling_mask |= mask; + return 0; + } + + bond = krealloc(ve->bonds, + sizeof(*bond) * (ve->num_bonds + 1), + GFP_KERNEL); + if (!bond) + return -ENOMEM; + + bond[ve->num_bonds].master = master; + bond[ve->num_bonds].sibling_mask = mask; + + ve->bonds = bond; + ve->num_bonds++; + + return 0; +} + void intel_virtual_engine_destroy(struct intel_engine_cs *engine) { struct virtual_engine *ve = to_virtual_engine(engine); diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index 9d90dc68e02b..77b85648045a 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -121,6 +121,10 @@ struct intel_engine_cs * intel_execlists_clone_virtual(struct i915_gem_context *ctx, struct intel_engine_cs *src); +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + struct intel_engine_cs *master, + unsigned long mask); + void intel_virtual_engine_destroy(struct intel_engine_cs *engine); u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu); diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c index 6df033960350..bc8e13f80fb5 100644 --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c @@ -13,6 +13,7 @@ #include "igt_live_test.h" #include "igt_spinner.h" #include "i915_random.h" +#include "lib_sw_fence.h" #include "mock_context.h" @@ -1221,6 +1222,189 @@ static int live_virtual_engine(void *arg) return err; } +static int bond_virtual_engine(struct drm_i915_private *i915, + unsigned int class, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int flags) +#define BOND_SCHEDULE BIT(0) +{ + struct intel_engine_cs *master; + struct i915_gem_context *ctx; + struct i915_request *rq[16]; + enum intel_engine_id id; + unsigned long n; + int err; + + GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1); + + ctx = kernel_context(i915); + if (!ctx) + return -ENOMEM; + + err = 0; + rq[0] = ERR_PTR(-ENOMEM); + for_each_engine(master, i915, id) { + struct i915_sw_fence fence = {}; + + if (master->class == class) + continue; + + memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq)); + + rq[0] = i915_request_alloc(master, ctx); + if (IS_ERR(rq[0])) { + err = PTR_ERR(rq[0]); + goto out; + } + i915_request_get(rq[0]); + + if (flags & BOND_SCHEDULE) { + onstack_fence_init(&fence); + err = i915_sw_fence_await_sw_fence_gfp(&rq[0]->submit, + &fence, + GFP_KERNEL); + } + i915_request_add(rq[0]); + if (err < 0) + goto out; + + for (n = 0; n < nsibling; n++) { + struct intel_engine_cs *engine; + + engine = intel_execlists_create_virtual(ctx, + siblings, + nsibling); + if (IS_ERR(engine)) { + err = PTR_ERR(engine); + onstack_fence_fini(&fence); + goto out; + } + + err = intel_virtual_engine_attach_bond(engine, + master, + BIT(n)); + if (err) { + intel_virtual_engine_destroy(engine); + onstack_fence_fini(&fence); + goto out; + } + + rq[n + 1] = i915_request_alloc(engine, ctx); + if (IS_ERR(rq[n + 1])) { + err = PTR_ERR(rq[n + 1]); + intel_virtual_engine_destroy(engine); + onstack_fence_fini(&fence); + goto out; + } + i915_request_get(rq[n + 1]); + + err = i915_request_await_execution(rq[n + 1], + &rq[0]->fence, + engine->bond_execute); + i915_request_add(rq[n + 1]); + intel_virtual_engine_destroy(engine); + if (err < 0) { + onstack_fence_fini(&fence); + goto out; + } + } + onstack_fence_fini(&fence); + + if (i915_request_wait(rq[0], + I915_WAIT_LOCKED, + HZ / 10) < 0) { + pr_err("Master request did not execute (on %s)!\n", + rq[0]->engine->name); + err = -EIO; + goto out; + } + + for (n = 0; n < nsibling; n++) { + if (i915_request_wait(rq[n + 1], + I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT) < 0) { + err = -EIO; + goto out; + } + + if (rq[n + 1]->engine != siblings[n]) { + pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n", + siblings[n]->name, + rq[n + 1]->engine->name, + rq[0]->engine->name); + err = -EINVAL; + goto out; + } + } + + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + rq[0] = ERR_PTR(-ENOMEM); + } + +out: + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + kernel_context_close(ctx); + return err; +} + +static int live_virtual_bond(void *arg) +{ + static const struct phase { + const char *name; + unsigned int flags; + } phases[] = { + { "", 0 }, + { "schedule", BOND_SCHEDULE }, + { }, + }; + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class, inst; + int err = 0; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + const struct phase *p; + int nsibling; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + break; + + GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings)); + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + for (p = phases; p->name; p++) { + err = bond_virtual_engine(i915, + class, siblings, nsibling, + p->flags); + if (err) { + pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n", + __func__, p->name, class, nsibling, err); + goto out_unlock; + } + } + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + int intel_execlists_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { @@ -1233,6 +1417,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_preempt_hang), SUBTEST(live_preempt_smoke), SUBTEST(live_virtual_engine), + SUBTEST(live_virtual_bond), }; if (!HAS_EXECLISTS(i915)) diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c index 2bfa72c1654b..b976c12817c5 100644 --- a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c +++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c @@ -45,6 +45,9 @@ void __onstack_fence_init(struct i915_sw_fence *fence, void onstack_fence_fini(struct i915_sw_fence *fence) { + if (!fence->flags) + return; + i915_sw_fence_commit(fence); i915_sw_fence_fini(fence); } diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 9c94c037d13b..0d9ca4fb9edb 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1532,6 +1532,10 @@ struct drm_i915_gem_context_param { * sized argument, will revert back to default settings. * * See struct i915_context_param_engines. + * + * Extensions: + * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE) + * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) */ #define I915_CONTEXT_PARAM_ENGINES 0xa /* Must be kept compact -- no holes and well documented */ @@ -1627,9 +1631,38 @@ struct i915_context_engines_load_balance { __u64 mbz64[4]; /* reserved for future use; must be zero */ }; +/* + * i915_context_engines_bond: + * + * Constructed bonded pairs for execution within a virtual engine. + * + * All engines are equal, but some are more equal than others. Given + * the distribution of resources in the HW, it may be preferable to run + * a request on a given subset of engines in parallel to a request on a + * specific engine. We enable this selection of engines within a virtual + * engine by specifying bonding pairs, for any given master engine we will + * only execute on one of the corresponding siblings within the virtual engine. + * + * To execute a request in parallel on the master engine and a sibling requires + * coordination with a I915_EXEC_FENCE_SUBMIT. + */ +struct i915_context_engines_bond { + struct i915_user_extension base; + + __u16 virtual_index; /* index of virtual engine in ctx->engines[] */ + __u16 mbz; + + __u16 master_class; + __u16 master_instance; + + __u64 sibling_mask; /* bitmask of BIT(sibling_index) wrt the v.engine */ + __u64 flags; /* all undefined flags must be zero */ +}; + struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 +#define I915_CONTEXT_ENGINES_EXT_BOND 1 struct { __u16 engine_class; /* see enum drm_i915_gem_engine_class */ -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx