Some users require that when a master batch is executed on one particular engine, a companion batch is run simultaneously on a specific slave engine. For this purpose, we introduce virtual engine bonding, allowing maps of master:slaves to be constructed to constrain which physical engines a virtual engine may select given a fence on a master engine. For the moment, we continue to ignore the issue of preemption deferring the master request for later. Ideally, we would like to then also remove the slave and run something else rather than have it stall the pipeline. With load balancing, we should be able to move workload around it, but there is a similar stall on the master pipeline while it may wait for the slave to be executed. At the cost of more latency for the bonded request, it may be interesting to launch both on their engines in lockstep. (Bubbles abound.) Opens: Also what about bonding an engine as its own master? It doesn't break anything internally, so allow the silliness. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_gem_context.c | 50 ++++++ drivers/gpu/drm/i915/i915_request.c | 1 + drivers/gpu/drm/i915/i915_request.h | 1 + drivers/gpu/drm/i915/intel_engine_types.h | 7 + drivers/gpu/drm/i915/intel_lrc.c | 111 ++++++++++++++ drivers/gpu/drm/i915/intel_lrc.h | 4 + drivers/gpu/drm/i915/selftests/intel_lrc.c | 167 +++++++++++++++++++++ include/uapi/drm/i915_drm.h | 22 +++ 8 files changed, 363 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 13b79980f7f3..0d86306497b8 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1484,8 +1484,58 @@ set_engines__load_balance(struct i915_user_extension __user *base, void *data) return 0; } +static int +set_engines__bond(struct i915_user_extension __user *base, void *data) +{ + struct i915_context_engines_bond __user *ext = + container_of_user(base, typeof(*ext), base); 
+ const struct set_engines *set = data; + struct intel_engine_cs *master; + u32 class, instance, siblings; + u16 idx; + int err; + + if (get_user(idx, &ext->engine_index)) + return -EFAULT; + + if (idx >= set->nengine) + return -EINVAL; + + idx = array_index_nospec(idx, set->nengine); + if (!set->engines[idx]) + return -EINVAL; + + /* + * A non-virtual engine has 0 siblings to choose between; and submit + * fence will always be directed to the one engine. + */ + if (!intel_engine_is_virtual(set->engines[idx])) + return 0; + + err = check_user_mbz16(&ext->mbz); + if (err) + return err; + + if (get_user(class, &ext->master_class)) + return -EFAULT; + + if (get_user(instance, &ext->master_instance)) + return -EFAULT; + + master = intel_engine_lookup_user(set->ctx->i915, class, instance); + if (!master) + return -EINVAL; + + if (get_user(siblings, &ext->sibling_mask)) + return -EFAULT; + + return intel_virtual_engine_attach_bond(set->engines[idx], + master, siblings); +} + static const i915_user_extension_fn set_engines__extensions[] = { [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance, + [I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond, }; static int diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 5527ab22dbf2..0caf31de2b98 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -743,6 +743,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) rq->batch = NULL; rq->capture_list = NULL; rq->waitboost = false; + rq->execution_mask = ~0u; /* * Reserve space in the ring buffer for all the commands required to diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index d4f6b2940130..862b25930de0 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -145,6 +145,7 @@ struct i915_request { */ struct i915_sched_node sched; struct i915_dependency dep; + unsigned int 
execution_mask; /* * A convenience pointer to the current breadcrumb value stored in diff --git a/drivers/gpu/drm/i915/intel_engine_types.h b/drivers/gpu/drm/i915/intel_engine_types.h index d54d2a1840cc..6dfcf5cc08c1 100644 --- a/drivers/gpu/drm/i915/intel_engine_types.h +++ b/drivers/gpu/drm/i915/intel_engine_types.h @@ -382,6 +382,13 @@ struct intel_engine_cs { */ void (*submit_request)(struct i915_request *rq); + /* + * Called on signaling of a SUBMIT_FENCE, passing along the signaling + * request down to the bonded pairs. + */ + void (*bond_execute)(struct i915_request *rq, + struct dma_fence *signal); + /* * Call when the priority on a request has changed and it and its * dependencies may need rescheduling. Note the request itself may diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 0c97e8f30223..f06312d185af 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -179,6 +179,12 @@ struct virtual_engine { int prio; } nodes[I915_NUM_ENGINES]; + struct ve_bond { + struct intel_engine_cs *master; + unsigned int sibling_mask; + } *bonds; + unsigned int nbond; + unsigned int count; struct intel_engine_cs *siblings[0]; }; @@ -3183,6 +3189,7 @@ static const struct intel_context_ops virtual_context_ops = { static void virtual_submission_tasklet(unsigned long data) { struct virtual_engine * const ve = (struct virtual_engine *)data; + unsigned int mask; unsigned int n; int prio; @@ -3191,12 +3198,30 @@ static void virtual_submission_tasklet(unsigned long data) return; local_irq_disable(); + + mask = 0; + spin_lock(&ve->base.timeline.lock); + if (ve->request) + mask = ve->request->execution_mask; + spin_unlock(&ve->base.timeline.lock); + for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) { struct intel_engine_cs *sibling = ve->siblings[n]; struct ve_node * const node = &ve->nodes[sibling->id]; struct rb_node **parent, *rb; bool first; + if (unlikely(!(mask & sibling->mask))) { + if 
(!RB_EMPTY_NODE(&node->rb)) { + spin_lock(&sibling->timeline.lock); + rb_erase_cached(&node->rb, + &sibling->execlists.virtual); + RB_CLEAR_NODE(&node->rb); + spin_unlock(&sibling->timeline.lock); + } + continue; + } + spin_lock(&sibling->timeline.lock); if (!RB_EMPTY_NODE(&node->rb)) { @@ -3254,6 +3279,30 @@ static void virtual_submit_request(struct i915_request *request) tasklet_schedule(&ve->base.execlists.tasklet); } +static struct ve_bond * +virtual_find_bond(struct virtual_engine *ve, struct intel_engine_cs *master) +{ /* Return the bond recorded for @master on @ve, or NULL if there is none. */ + unsigned int i; /* same type as ve->nbond, avoiding a signed/unsigned compare */ + + for (i = 0; i < ve->nbond; i++) { + if (ve->bonds[i].master == master) + return &ve->bonds[i]; + } + + return NULL; +} + +static void +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + struct ve_bond *bond; + + bond = virtual_find_bond(ve, to_request(signal)->engine); + if (bond) /* XXX serialise with rq->lock? */ + rq->execution_mask &= bond->sibling_mask; +} + struct intel_engine_cs * intel_execlists_create_virtual(struct i915_gem_context *ctx, struct intel_engine_cs **siblings, @@ -3294,6 +3343,7 @@ intel_execlists_create_virtual(struct i915_gem_context *ctx, ve->base.schedule = i915_schedule; ve->base.submit_request = virtual_submit_request; + ve->base.bond_execute = virtual_bond_execute; ve->base.execlists.queue_priority_hint = INT_MIN; tasklet_init(&ve->base.execlists.tasklet, @@ -3369,9 +3419,70 @@ intel_execlists_clone_virtual(struct i915_gem_context *ctx, if (IS_ERR(dst)) return dst; + if (se->nbond) { + struct virtual_engine *de = to_virtual_engine(dst); + + de->bonds = kmemdup(se->bonds, + sizeof(*se->bonds) * se->nbond, + GFP_KERNEL); + if (!de->bonds) { + intel_virtual_engine_destroy(dst); + return ERR_PTR(-ENOMEM); + } + + de->nbond = se->nbond; + } + return dst; } +static unsigned long +virtual_execution_mask(struct virtual_engine *ve, unsigned long mask) +{ + unsigned long emask = 0; + int bit; + + for_each_set_bit(bit, &mask, 
ve->count) + emask |= ve->siblings[bit]->mask; + + return emask; +} + +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + struct intel_engine_cs *master, + unsigned long mask) +{ /* Allow requests bonded to @master to run on the siblings of @engine selected by @mask. */ + struct virtual_engine *ve = to_virtual_engine(engine); + struct ve_bond *bond; + + if (mask >> ve->count) /* a bit at or beyond ve->count names no sibling */ + return -EINVAL; + + mask = virtual_execution_mask(ve, mask); /* sibling indices -> engine->mask bits */ + if (!mask) + return -EINVAL; + + bond = virtual_find_bond(ve, master); + if (bond) { /* @master is already bonded: widen its allowed set */ + bond->sibling_mask |= mask; + return 0; + } + + bond = krealloc(ve->bonds, + sizeof(*bond) * (ve->nbond + 1), + GFP_KERNEL); + if (!bond) + return -ENOMEM; + + bond[ve->nbond].master = master; /* fill the newly added last slot */ + bond[ve->nbond].sibling_mask = mask; + + ve->bonds = bond; /* NOTE(review): no kfree(ve->bonds) is visible in this patch; confirm intel_virtual_engine_destroy() releases it */ + ve->nbond++; + + return 0; +} + void intel_virtual_engine_destroy(struct intel_engine_cs *engine) { struct virtual_engine *ve = to_virtual_engine(engine); diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index 9d90dc68e02b..77b85648045a 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -121,6 +121,10 @@ struct intel_engine_cs * intel_execlists_clone_virtual(struct i915_gem_context *ctx, struct intel_engine_cs *src); +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + struct intel_engine_cs *master, + unsigned long mask); + void intel_virtual_engine_destroy(struct intel_engine_cs *engine); u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu); diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c index 4b8a339529d1..a7de7a8fc24a 100644 --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c @@ -13,6 +13,7 @@ #include "igt_live_test.h" #include "igt_spinner.h" #include "i915_random.h" +#include "lib_sw_fence.h" #include "mock_context.h" @@ -1224,6 +1225,171 @@ static int live_virtual_engine(void *arg) return err; } +static int bond_virtual_engine(struct 
drm_i915_private *i915, + unsigned int class, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int flags) +#define BOND_SCHEDULE BIT(0) +{ /* For each engine outside @class: submit a master request plus one request bonded to each sibling, then check each ran on its sibling. */ + struct intel_engine_cs *master; + struct i915_gem_context *ctx; + struct i915_request *rq[16]; + enum intel_engine_id id; + unsigned long n; + int err; + + GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1); + + ctx = kernel_context(i915); + if (!ctx) + return -ENOMEM; + + err = 0; + memset_p((void **)rq, ERR_PTR(-ENOMEM), ARRAY_SIZE(rq)); /* every slot is a terminator until it holds a reference */ + for_each_engine(master, i915, id) { + struct i915_sw_fence fence; + + if (master->class == class) + continue; + + rq[0] = i915_request_alloc(master, ctx); + if (IS_ERR(rq[0])) { + err = PTR_ERR(rq[0]); + goto out; + } + + if (flags & BOND_SCHEDULE) + onstack_fence_init(&fence); /* NOTE(review): nothing here waits on this fence, and the goto out paths skip its fini */ + + i915_request_get(rq[0]); + i915_request_add(rq[0]); + + for (n = 0; n < nsibling; n++) { + struct intel_engine_cs *engine; + + engine = intel_execlists_create_virtual(ctx, + siblings, + nsibling); + if (IS_ERR(engine)) { + err = PTR_ERR(engine); + goto out; + } + + err = intel_virtual_engine_attach_bond(engine, + master, + BIT(n)); + if (err) { + intel_virtual_engine_destroy(engine); + goto out; + } + + rq[n + 1] = i915_request_alloc(engine, ctx); + if (IS_ERR(rq[n + 1])) { + err = PTR_ERR(rq[n + 1]); + intel_virtual_engine_destroy(engine); + goto out; + } + i915_request_get(rq[n + 1]); + + err = i915_request_await_execution(rq[n + 1], + &rq[0]->fence, + engine->bond_execute); + i915_request_add(rq[n + 1]); + intel_virtual_engine_destroy(engine); + if (err < 0) + goto out; + } + rq[n + 1] = ERR_PTR(-EINVAL); + + if (flags & BOND_SCHEDULE) + onstack_fence_fini(&fence); + + for (n = 0; n < nsibling; n++) { + if (i915_request_wait(rq[n + 1], + I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT) < 0) { + err = -EIO; + goto out; + } + + if (rq[n + 1]->engine != siblings[n]) { + pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n", + siblings[n]->name, + rq[n + 1]->engine->name, + 
rq[0]->engine->name); + err = -EINVAL; + goto out; + } + } + + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + memset_p((void **)rq, ERR_PTR(-ENOMEM), ARRAY_SIZE(rq)); /* forget the released pointers before the next master */ + } + +out: + for (n = 0; !IS_ERR(rq[n]); n++) /* stops at the first slot holding no reference */ + i915_request_put(rq[n]); + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + kernel_context_close(ctx); + return err; +} + +static int live_virtual_bond(void *arg) +{ + static const struct phase { + const char *name; + unsigned int flags; + } phases[] = { + { "", 0 }, + { "schedule", BOND_SCHEDULE }, + { }, + }; + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class, inst; + int err = 0; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + const struct phase *p; + int nsibling; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + break; + + GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings)); + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + for (p = phases; p->name; p++) { + err = bond_virtual_engine(i915, + class, siblings, nsibling, + p->flags); + if (err) { + pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n", + __func__, p->name, class, nsibling, err); + goto out_unlock; + } + } + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + int intel_execlists_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { @@ -1236,6 +1402,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_preempt_hang), SUBTEST(live_preempt_smoke), SUBTEST(live_virtual_engine), + SUBTEST(live_virtual_bond), }; if (!HAS_EXECLISTS(i915)) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 592b02676044..94e72ae954a0 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1530,6 
+1530,10 @@ struct drm_i915_gem_context_param { * sized argument, will revert back to default settings. * * See struct i915_context_param_engines. + * + * Extensions: + * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE) + * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) */ #define I915_CONTEXT_PARAM_ENGINES 0xa /* Must be kept compact -- no holes and well documented */ @@ -1625,9 +1629,27 @@ struct i915_context_engines_load_balance { __u64 mbz64[4]; /* reserved for future use; must be zero */ }; +/* + * i915_context_engines_bond: + * Restrict which siblings of the virtual engine at engine_index may be selected for a request bonded to a submit fence from the master engine. + */ +struct i915_context_engines_bond { + struct i915_user_extension base; + + __u16 engine_index; /* index into the context's engine array */ + __u16 mbz; /* must be zero */ + + __u16 master_class; /* engine class of the master */ + __u16 master_instance; /* engine instance of the master */ + + __u64 sibling_mask; /* bit n selects sibling n of the virtual engine */ + __u64 flags; /* all undefined flags must be zero */ +}; + struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 +#define I915_CONTEXT_ENGINES_EXT_BOND 1 struct { __u16 engine_class; /* see enum drm_i915_gem_engine_class */ -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx