On Fri, Sep 10, 2021 at 12:25:43PM +0100, Tvrtko Ursulin wrote: > > On 20/08/2021 23:44, Matthew Brost wrote: > > For some users of multi-lrc, e.g. split frame, it isn't safe to preempt > > mid BB. To safely enable preemption at the BB boundary, a handshake > > between the parent and child is needed. This is implemented via custom > > emit_bb_start & emit_fini_breadcrumb functions and enabled by > > default if a context is configured by the set parallel extension. > > FWIW I think it's wrong to hardcode the requirements of a particular > hardware generation fixed media pipeline into the uapi. IMO better solution > was when concept of parallel submission was decoupled from the no preemption > mid batch preambles. Otherwise might as well call the extension > I915_CONTEXT_ENGINES_EXT_MEDIA_SPLIT_FRAME_SUBMIT or something. > I don't disagree but this is where we landed per Daniel Vetter's feedback - default to what our current hardware supports and extend it later to newer hardware / requirements as needed. 
Matt > Regards, > > Tvrtko > > Signed-off-by: Matthew Brost <matthew.brost@xxxxxxxxx> > > --- > > drivers/gpu/drm/i915/gt/intel_context.c | 2 +- > > drivers/gpu/drm/i915/gt/intel_context_types.h | 3 + > > drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 2 +- > > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++- > > 4 files changed, 287 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c > > index 5615be32879c..2de62649e275 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_context.c > > +++ b/drivers/gpu/drm/i915/gt/intel_context.c > > @@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent, > > GEM_BUG_ON(intel_context_is_child(child)); > > GEM_BUG_ON(intel_context_is_parent(child)); > > - parent->guc_number_children++; > > + child->guc_child_index = parent->guc_number_children++; > > list_add_tail(&child->guc_child_link, > > &parent->guc_child_list); > > child->parent = parent; > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h > > index 713d85b0b364..727f91e7f7c2 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > > @@ -246,6 +246,9 @@ struct intel_context { > > /** @guc_number_children: number of children if parent */ > > u8 guc_number_children; > > + /** @guc_child_index: index into guc_child_list if child */ > > + u8 guc_child_index; > > + > > /** > > * @parent_page: page in context used by parent for work queue, > > * work queue descriptor > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h > > index 6cd26dc060d1..9f61cfa5566a 100644 > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h > > @@ -188,7 +188,7 @@ struct guc_process_desc { > > u32 wq_status; > > u32 engine_presence; > > u32 priority; > > - 
u32 reserved[30]; > > + u32 reserved[36]; > > } __packed; > > #define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > index 91330525330d..1a18f99bf12a 100644 > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > @@ -11,6 +11,7 @@ > > #include "gt/intel_context.h" > > #include "gt/intel_engine_pm.h" > > #include "gt/intel_engine_heartbeat.h" > > +#include "gt/intel_gpu_commands.h" > > #include "gt/intel_gt.h" > > #include "gt/intel_gt_irq.h" > > #include "gt/intel_gt_pm.h" > > @@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb) > > /* > > * When using multi-lrc submission an extra page in the context state is > > - * reserved for the process descriptor and work queue. > > + * reserved for the process descriptor, work queue, and preempt BB boundary > > + * handshake between the parent + children contexts. > > * > > * The layout of this page is below: > > * 0 guc_process_desc > > + * + sizeof(struct guc_process_desc) child go > > + * + CACHELINE_BYTES child join ... > > + * + CACHELINE_BYTES ... > > * ... unused > > * PAGE_SIZE / 2 work queue start > > * ... 
work queue > > @@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop) > > return __guc_action_deregister_context(guc, guc_id, loop); > > } > > +static inline void clear_children_join_go_memory(struct intel_context *ce) > > +{ > > + u32 *mem = (u32 *)(__get_process_desc(ce) + 1); > > + u8 i; > > + > > + for (i = 0; i < ce->guc_number_children + 1; ++i) > > + mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0; > > +} > > + > > +static inline u32 get_children_go_value(struct intel_context *ce) > > +{ > > + u32 *mem = (u32 *)(__get_process_desc(ce) + 1); > > + > > + return mem[0]; > > +} > > + > > +static inline u32 get_children_join_value(struct intel_context *ce, > > + u8 child_index) > > +{ > > + u32 *mem = (u32 *)(__get_process_desc(ce) + 1); > > + > > + return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))]; > > +} > > + > > static void guc_context_policy_init(struct intel_engine_cs *engine, > > struct guc_lrc_desc *desc) > > { > > @@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) > > desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; > > guc_context_policy_init(engine, desc); > > } > > + > > + clear_children_join_go_memory(ce); > > } > > /* > > @@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = { > > .get_sibling = guc_virtual_get_sibling, > > }; > > +/* > > + * The below override of the breadcrumbs is enabled when the user configures a > > + * context for parallel submission (multi-lrc, parent-child). > > + * > > + * The overridden breadcrumbs implements an algorithm which allows the GuC to > > + * safely preempt all the hw contexts configured for parallel submission > > + * between each BB. The contract between the i915 and GuC is if the parent > > + * context can be preempted, all the children can be preempted, and the GuC will > > + * always try to preempt the parent before the children. 
A handshake between the > > + * parent / children breadcrumbs ensures the i915 holds up its end of the deal > > + * creating a window to preempt between each set of BBs. > > + */ > > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, > > + u64 offset, u32 len, > > + const unsigned int flags); > > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, > > + u64 offset, u32 len, > > + const unsigned int flags); > > +static u32 * > > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, > > + u32 *cs); > > +static u32 * > > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, > > + u32 *cs); > > + > > static struct intel_context * > > guc_create_parallel(struct intel_engine_cs **engines, > > unsigned int num_siblings, > > @@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines, > > } > > } > > + parent->engine->emit_bb_start = > > + emit_bb_start_parent_no_preempt_mid_batch; > > + parent->engine->emit_fini_breadcrumb = > > + emit_fini_breadcrumb_parent_no_preempt_mid_batch; > > + parent->engine->emit_fini_breadcrumb_dw = > > + 12 + 4 * parent->guc_number_children; > > + for_each_child(parent, ce) { > > + ce->engine->emit_bb_start = > > + emit_bb_start_child_no_preempt_mid_batch; > > + ce->engine->emit_fini_breadcrumb = > > + emit_fini_breadcrumb_child_no_preempt_mid_batch; > > + ce->engine->emit_fini_breadcrumb_dw = 16; > > + } > > + > > kfree(siblings); > > return parent; > > @@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc) > > guc->submission_selected = __guc_submission_selected(guc); > > } > > +static inline u32 get_children_go_addr(struct intel_context *ce) > > +{ > > + GEM_BUG_ON(!intel_context_is_parent(ce)); > > + > > + return i915_ggtt_offset(ce->state) + > > + __get_process_desc_offset(ce) + > > + sizeof(struct guc_process_desc); > > +} > > + > > +static inline u32 get_children_join_addr(struct intel_context *ce, 
> > + u8 child_index) > > +{ > > + GEM_BUG_ON(!intel_context_is_parent(ce)); > > + > > + return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES; > > +} > > + > > +#define PARENT_GO_BB 1 > > +#define PARENT_GO_FINI_BREADCRUMB 0 > > +#define CHILD_GO_BB 1 > > +#define CHILD_GO_FINI_BREADCRUMB 0 > > +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, > > + u64 offset, u32 len, > > + const unsigned int flags) > > +{ > > + struct intel_context *ce = rq->context; > > + u32 *cs; > > + u8 i; > > + > > + GEM_BUG_ON(!intel_context_is_parent(ce)); > > + > > + cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children); > > + if (IS_ERR(cs)) > > + return PTR_ERR(cs); > > + > > + /* Wait on children */ > > + for (i = 0; i < ce->guc_number_children; ++i) { > > + *cs++ = (MI_SEMAPHORE_WAIT | > > + MI_SEMAPHORE_GLOBAL_GTT | > > + MI_SEMAPHORE_POLL | > > + MI_SEMAPHORE_SAD_EQ_SDD); > > + *cs++ = PARENT_GO_BB; > > + *cs++ = get_children_join_addr(ce, i); > > + *cs++ = 0; > > + } > > + > > + /* Turn off preemption */ > > + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; > > + *cs++ = MI_NOOP; > > + > > + /* Tell children go */ > > + cs = gen8_emit_ggtt_write(cs, > > + CHILD_GO_BB, > > + get_children_go_addr(ce), > > + 0); > > + > > + /* Jump to batch */ > > + *cs++ = MI_BATCH_BUFFER_START_GEN8 | > > + (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); > > + *cs++ = lower_32_bits(offset); > > + *cs++ = upper_32_bits(offset); > > + *cs++ = MI_NOOP; > > + > > + intel_ring_advance(rq, cs); > > + > > + return 0; > > +} > > + > > +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, > > + u64 offset, u32 len, > > + const unsigned int flags) > > +{ > > + struct intel_context *ce = rq->context; > > + u32 *cs; > > + > > + GEM_BUG_ON(!intel_context_is_child(ce)); > > + > > + cs = intel_ring_begin(rq, 12); > > + if (IS_ERR(cs)) > > + return PTR_ERR(cs); > > + > > + /* Signal parent */ > > + cs = gen8_emit_ggtt_write(cs, > > + PARENT_GO_BB, > > + get_children_join_addr(ce->parent, > > + ce->guc_child_index), > > + 0); > > + > > + /* Wait parent on for go */ > > + *cs++ = (MI_SEMAPHORE_WAIT | > > + MI_SEMAPHORE_GLOBAL_GTT | > > + MI_SEMAPHORE_POLL | > > + MI_SEMAPHORE_SAD_EQ_SDD); > > + *cs++ = CHILD_GO_BB; > > + *cs++ = get_children_go_addr(ce->parent); > > + *cs++ = 0; > > + > > + /* Turn off preemption */ > > + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; > > + > > + /* Jump to batch */ > > + *cs++ = MI_BATCH_BUFFER_START_GEN8 | > > + (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); > > + *cs++ = lower_32_bits(offset); > > + *cs++ = upper_32_bits(offset); > > + > > + intel_ring_advance(rq, cs); > > + > > + return 0; > > +} > > + > > +static u32 * > > +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, > > + u32 *cs) > > +{ > > + struct intel_context *ce = rq->context; > > + u8 i; > > + > > + GEM_BUG_ON(!intel_context_is_parent(ce)); > > + > > + /* Wait on children */ > > + for (i = 0; i < ce->guc_number_children; ++i) { > > + *cs++ = (MI_SEMAPHORE_WAIT | > > + MI_SEMAPHORE_GLOBAL_GTT | > > + MI_SEMAPHORE_POLL | > > + MI_SEMAPHORE_SAD_EQ_SDD); > > + *cs++ = PARENT_GO_FINI_BREADCRUMB; > > + *cs++ = get_children_join_addr(ce, i); > > + *cs++ = 0; > > + } > > + > > + /* Turn on preemption */ > > + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > > + *cs++ = MI_NOOP; > > + > > + /* Tell children go */ > > + cs = gen8_emit_ggtt_write(cs, > > + CHILD_GO_FINI_BREADCRUMB, > > + get_children_go_addr(ce), > > + 0); > > + > > + /* Emit fini breadcrumb */ > > + cs = gen8_emit_ggtt_write(cs, > > + rq->fence.seqno, > > + i915_request_active_timeline(rq)->hwsp_offset, > > + 0); > > + > > + /* User interrupt */ > > + *cs++ = MI_USER_INTERRUPT; > > + *cs++ = MI_NOOP; > > + > > + rq->tail = intel_ring_offset(rq, cs); > > + > > + return cs; > > +} > > + > > +static u32 * > > +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs) > > +{ > > + struct intel_context *ce = rq->context; > > + > > + GEM_BUG_ON(!intel_context_is_child(ce)); > > + > > + /* Turn on preemption */ > > + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > > + *cs++ = MI_NOOP; > > + > > + /* Signal parent */ > > + cs = gen8_emit_ggtt_write(cs, > > + PARENT_GO_FINI_BREADCRUMB, > > + get_children_join_addr(ce->parent, > > + ce->guc_child_index), > > + 0); > > + > > + /* Wait parent on for go */ > > + *cs++ = (MI_SEMAPHORE_WAIT | > > + MI_SEMAPHORE_GLOBAL_GTT | > > + MI_SEMAPHORE_POLL | > > + MI_SEMAPHORE_SAD_EQ_SDD); > > + *cs++ = 
CHILD_GO_FINI_BREADCRUMB; > > + *cs++ = get_children_go_addr(ce->parent); > > + *cs++ = 0; > > + > > + /* Emit fini breadcrumb */ > > + cs = gen8_emit_ggtt_write(cs, > > + rq->fence.seqno, > > + i915_request_active_timeline(rq)->hwsp_offset, > > + 0); > > + > > + /* User interrupt */ > > + *cs++ = MI_USER_INTERRUPT; > > + *cs++ = MI_NOOP; > > + > > + rq->tail = intel_ring_offset(rq, cs); > > + > > + return cs; > > +} > > + > > static struct intel_context * > > g2h_context_lookup(struct intel_guc *guc, u32 desc_idx) > > { > > @@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, > > drm_printf(p, "\t\tWQI Status: %u\n\n", > > READ_ONCE(desc->wq_status)); > > + drm_printf(p, "\t\tNumber Children: %u\n\n", > > + ce->guc_number_children); > > + if (ce->engine->emit_bb_start == > > + emit_bb_start_parent_no_preempt_mid_batch) { > > + u8 i; > > + > > + drm_printf(p, "\t\tChildren Go: %u\n\n", > > + get_children_go_value(ce)); > > + for (i = 0; i < ce->guc_number_children; ++i) > > + drm_printf(p, "\t\tChildren Join: %u\n", > > + get_children_join_value(ce, i)); > > + } > > + > > for_each_child(ce, child) > > guc_log_context(p, child); > > } > >