On Wed, 26 Apr 2023 17:11:27 -0700, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

> On Mon, Apr 24, 2023 at 10:41:41AM -0700, Dixit, Ashutosh wrote:
> > On Tue, 04 Apr 2023 17:14:32 -0700, Umesh Nerlige Ramappa wrote:
> > >
> >> GPU accumulates the context runtime in a 32 bit counter - CTX_TIMESTAMP
> >> in the context image. This value is saved/restored on context switches.
> >> KMD accumulates these values into a 64 bit counter taking care of any
> >> overflows as needed. This count provides the basis for client specific
> >> busyness in the fdinfo interface.
> >>
> >> KMD accumulation happens just before the context is unpinned and when
> >> context switches out. This works for execlist back-end since execlist
> >> scheduling has visibility into context switches. With GuC mode, KMD does
> >> not have visibility into context switches and this counter is
> >> accumulated only when context is unpinned. Context is unpinned once the
> >> context scheduling is successfully disabled. Disabling context
> >> scheduling is an asynchronous operation.
> >
> > This means guc_context_unpin() is called asynchronously, correct? From
> > guc_context_sched_disable()?
>
> correct
>
> >
> >> Also if a context is servicing frequent requests, scheduling may never be
> >> disabled on it.
> >>
> >> For GuC mode, since updates to the context runtime may be delayed, add
> >> hooks to update the context runtime in a worker thread as well as when
> >> a user queries for it.
> >>
> >> Limitation:
> >> - If a context is never switched out or runs for a long period of time,
> >>   the runtime value of CTX_TIMESTAMP may never be updated, so the
> >>   counter value may be unreliable. This patch does not support such
> >>   cases. Such support must be available from the GuC FW and it is WIP.
> >>
> >> This patch is an extract from previous work authored by John/Umesh here -
> >> https://patchwork.freedesktop.org/patch/496441/?series=105085&rev=4
> >>
> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@xxxxxxxxx>
> >> Co-developed-by: John Harrison <John.C.Harrison@xxxxxxxxx>
> >> Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx>
> >> ---
> >>  drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++++--
> >>  drivers/gpu/drm/i915/gt/intel_context.h       |  2 +-
> >>  drivers/gpu/drm/i915/gt/intel_context_types.h |  5 +++
> >>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 33 +++++++++++++++++++
> >>  4 files changed, 49 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> >> index 2aa63ec521b8..e01f222e9e42 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> >> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> >> @@ -578,16 +578,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
> >>  	child->parallel.parent = parent;
> >>  }
> >>
> >> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
> >> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
> >>  {
> >>  	u64 total, active;
> >>
> >> +	if (ce->ops->update_stats)
> >> +		ce->ops->update_stats(ce);
> >> +
> >>  	total = ce->stats.runtime.total;
> >>  	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
> >>  		total *= ce->engine->gt->clock_period_ns;
> >>
> >>  	active = READ_ONCE(ce->stats.active);
> >> -	if (active)
> >> +	/*
> >> +	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
> >> +	 * already provides the total active time of the context,
> >
> > Where is this done in the GuC case? I looked but couldn't find it (at
> > least in this version of the patch, it is there in the old version).
>
> In this case, the backend is not providing the total active time, so I
> guess I should drop this logic, provided ce->stats.active is 0 for GuC
> mode.

Yes, I think it is best to skip the active part in this patch since this is
a temporary/inaccurate solution.
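In other words, if ce->stats.active stays 0 for GuC, I don't think we even
need the COPS_RUNTIME_ACTIVE_TOTAL flag and the extra check: the GuC path
through intel_context_get_total_runtime_ns() effectively reduces to just the
accumulated total. Roughly this (only a sketch of what the GuC path would
compute, not a concrete patch):

u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
{
	u64 total;

	/* Fold the latest saved CTX_TIMESTAMP value into ce->stats */
	if (ce->ops->update_stats)
		ce->ops->update_stats(ce);

	total = ce->stats.runtime.total;
	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
		total *= ce->engine->gt->clock_period_ns;

	/* ce->stats.active is always 0 in GuC mode, so nothing to add */
	return total;
}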
> >
> >> +	 * so skip this calculation when this flag is set.
> >> +	 */
> >> +	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
> >>  		active = intel_context_clock() - active;
> >>
> >>  	return total + active;
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> >> index 0a8d553da3f4..720809523e2d 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> >> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> >> @@ -368,7 +368,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
> >>  	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
> >>  }
> >>
> >> -u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
> >> +u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
> >>  u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
> >>
> >>  static inline u64 intel_context_clock(void)
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> index e36670f2e626..58b0294d359d 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> @@ -38,6 +38,9 @@ struct intel_context_ops {
> >>  #define COPS_RUNTIME_CYCLES_BIT 1
> >>  #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
> >>
> >> +#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
> >> +#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
> >> +
> >>  	int (*alloc)(struct intel_context *ce);
> >>
> >>  	void (*revoke)(struct intel_context *ce, struct i915_request *rq,
> >> @@ -58,6 +61,8 @@ struct intel_context_ops {
> >>
> >>  	void (*sched_disable)(struct intel_context *ce);
> >>
> >> +	void (*update_stats)(struct intel_context *ce);
> >> +
> >>  	void (*reset)(struct intel_context *ce);
> >>  	void (*destroy)(struct kref *kref);
> >>
> >> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> >> index 88e881b100cf..8048a3e97a68 100644
> >> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> >> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> >> @@ -1402,13 +1402,25 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
> >>  	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> >>  }
> >>
> >> +static void guc_context_update_clks(struct intel_context *ce)
> >> +{
> >> +	struct intel_guc *guc = ce_to_guc(ce);
> >> +	unsigned long flags;
> >> +
> >> +	spin_lock_irqsave(&guc->timestamp.lock, flags);
> >> +	lrc_update_runtime(ce);
> >> +	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> >> +}
> >> +
> >>  static void guc_timestamp_ping(struct work_struct *wrk)
> >>  {
> >>  	struct intel_guc *guc = container_of(wrk, typeof(*guc),
> >>  					     timestamp.work.work);
> >>  	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
> >>  	struct intel_gt *gt = guc_to_gt(guc);
> >> +	struct intel_context *ce;
> >>  	intel_wakeref_t wakeref;
> >> +	unsigned long index;
> >>  	int srcu, ret;
> >>
> >>  	/*
> >> @@ -1424,6 +1436,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
> >>  	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
> >>  		__update_guc_busyness_stats(guc);
> >
> > How do we know the context images are pinned at this point, which is
> > needed for the code below? Where is the pinning happening?
>
> Ah, maybe this should just call - guc_context_update_stats()
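Right, calling guc_context_update_stats() from the ping worker would address
it, since intel_context_pin_if_active() guarantees the context image stays
resident while we read it. I imagine the loop would then look something like
this (just a sketch reusing the helpers from this patch):

	/* Sketch: only sample contexts whose image is known to be pinned */
	xa_for_each(&guc->context_lookup, index, ce) {
		if (!intel_context_pin_if_active(ce))
			continue; /* unpinned: runtime was folded in at unpin */

		guc_context_update_clks(ce);
		intel_context_unpin(ce);
	}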
> >
> >> +	/* adjust context stats for overflow */
> >> +	xa_for_each(&guc->context_lookup, index, ce)
> >> +		guc_context_update_clks(ce);
> >
> > Here are we saying that we need to do this because the context can get
> > switched out (and the context image saved) and back in multiple times
> > without the context getting unpinned? Otherwise guc_context_unpin() would
> > call lrc_update_runtime() and we wouldn't have to do this here.
>
> Mainly for 32 bit overflows. The busyness value obtained from the context
> image is a 32 bit value and could wrap around. If we keep grabbing it
> periodically and accumulating it into the 64 bit value in the ce stats, we
> should be good.

But it will not help unless the busyness value in the context image is being
updated, which will only happen if the context switches. So that is why I
was saying this makes sense only if context switches are happening but not
context unpins (which would anyway accumulate the values in
lrc_update_runtime()).
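To be clear, the wraparound arithmetic itself is not the issue: as long as
each context is sampled at least once per u32 wrap period, plain unsigned
subtraction of consecutive samples absorbs the overflow. Something along
these lines (illustrative helper, not the actual lrc_update_runtime()
internals):

	/*
	 * Sketch: accumulate a wrapping u32 HW counter into a u64 total.
	 * u32 subtraction yields the correct delta across a single wrap.
	 */
	static void runtime_accumulate(u64 *total, u32 *last, u32 sample)
	{
		u32 dt = sample - *last;

		*last = sample;
		*total += dt;
	}

The issue is that the sampled value only changes when GuC switches the
context out and saves the image, so periodic sampling cannot make forward
progress for a context that never switches out (the limitation already
noted in the commit message).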
> >
> >> +
> >>  	intel_gt_reset_unlock(gt, srcu);
> >>
> >>  	guc_enable_busyness_worker(guc);
> >> @@ -1505,6 +1521,17 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
> >>  	guc_enable_busyness_worker(guc);
> >>  }
> >>
> >> +static void guc_context_update_stats(struct intel_context *ce)
> >> +{
> >> +	if (!intel_context_pin_if_active(ce)) {
> >> +		WRITE_ONCE(ce->stats.active, 0);
> >
> > This is related to the question above about where ce->stats.active is
> > being updated in the GuC case. If it is not being updated then we
> > wouldn't have to do this here, we could just initialize it to 0 once or
> > it might already be initialized to 0 (if say ce->stats was kzalloc'd).
>
> Agree, will drop this.
>
> >
> >> +		return;
> >> +	}
> >> +
> >> +	guc_context_update_clks(ce);
> >> +	intel_context_unpin(ce);
> >> +}
> >> +
> >>  static inline bool
> >>  submission_disabled(struct intel_guc *guc)
> >>  {
> >> @@ -2774,6 +2801,7 @@ static void guc_context_unpin(struct intel_context *ce)
> >>  {
> >>  	struct intel_guc *guc = ce_to_guc(ce);
> >>
> >> +	lrc_update_runtime(ce);
> >
> > If we call lrc_update_runtime() from these 3 code paths (as is done in
> > this patch), we would need to hold guc->timestamp.lock here. Is that
> > happening (I don't see it), or do we need to call
> > guc_context_update_clks() here? I am assuming the context image is
> > pinned here so pinning is not an issue.
>
> Maybe will just call guc_context_update_clks() here.

If active is not there, it would be best if we call the same function from
all 3 places (unless the lock is already taken somewhere). Basically
something like:

static void guc_context_update_stats(struct intel_context *ce)
{
	struct intel_guc *guc = ce_to_guc(ce);
	unsigned long flags;

	intel_context_pin(ce);
	spin_lock_irqsave(&guc->timestamp.lock, flags);
	lrc_update_runtime(ce);
	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
	intel_context_unpin(ce);
}

And call this from everywhere.

Thanks.
--
Ashutosh

> >
> >>  	unpin_guc_id(guc, ce);
> >>  	lrc_unpin(ce);
> >>
> >> @@ -3455,6 +3483,7 @@ static void remove_from_context(struct i915_request *rq)
> >>  }
> >>
> >>  static const struct intel_context_ops guc_context_ops = {
> >> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
> >>  	.alloc = guc_context_alloc,
> >>
> >>  	.close = guc_context_close,
> >> @@ -3473,6 +3502,8 @@ static const struct intel_context_ops guc_context_ops = {
> >>
> >>  	.sched_disable = guc_context_sched_disable,
> >>
> >> +	.update_stats = guc_context_update_stats,
> >> +
> >>  	.reset = lrc_reset,
> >>  	.destroy = guc_context_destroy,
> >>
> >> @@ -3728,6 +3759,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
> >>  }
> >>
> >>  static const struct intel_context_ops virtual_guc_context_ops = {
> >> +	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
> >>  	.alloc = guc_virtual_context_alloc,
> >>
> >>  	.close = guc_context_close,
> >> @@ -3745,6 +3777,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
> >>  	.exit = guc_virtual_context_exit,
> >>
> >>  	.sched_disable = guc_context_sched_disable,
> >> +	.update_stats = guc_context_update_stats,
> >>
> >>  	.destroy = guc_context_destroy,
> >>
> >> --
> >> 2.36.1