Re: [PATCH] i915/pmu: Wire GuC backend to per-client busyness

Umesh Nerlige Ramappa <umesh.nerlige.ramappa@xxxxxxxxx> · Fri, 26 Aug 2022 09:33:08 -0700

On Thu, Aug 25, 2022 at 06:44:50PM -0700, Dixit, Ashutosh wrote:
On Thu, 04 Aug 2022 16:21:25 -0700, Umesh Nerlige Ramappa wrote:

Hi Umesh, I am fairly new to this code so some questions will be below will
be newbie questions, thanks for bearing with me.

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 654a092ed3d6..e2d70a9fdac0 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -576,16 +576,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
	child->parallel.parent = parent;
 }

-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
 {
	u64 total, active;

+	if (ce->ops->update_stats)
+		ce->ops->update_stats(ce);
+
	total = ce->stats.runtime.total;
	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
		total *= ce->engine->gt->clock_period_ns;

	active = READ_ONCE(ce->stats.active);
-	if (active)
+	/*
+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
+	 * already provides the total active time of the context, so skip this
+	 * calculation when this flag is set.
+	 */
+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
		active = intel_context_clock() - active;

	return total + active;

/snip/

@@ -1396,6 +1399,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
		__update_guc_busyness_stats(guc);

+	/* adjust context stats for overflow */
+	xa_for_each(&guc->context_lookup, index, ce)
+		__guc_context_update_clks(ce);

What is the reason for calling __guc_context_update_clks() periodically
from guc_timestamp_ping() since it appears we should just be able to call
__guc_context_update_clks() from intel_context_get_total_runtime_ns() to
update 'active'? Is the reason for calling __guc_context_update_clks()
periodically that the calculations in __guc_context_update_clks() become
invalid if the counters overflow?

Correct, these are 32-bit counters and the worker just tracks overflow.


+
	intel_gt_reset_unlock(gt, srcu);

	mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
@@ -1469,6 +1476,56 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
			 guc->timestamp.ping_delay);
 }

+static void __guc_context_update_clks(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	struct intel_gt *gt = ce->engine->gt;
+	u32 *pphwsp, last_switch, engine_id;
+	u64 start_gt_clk, active;
+	unsigned long flags;
+	ktime_t unused;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+	/*
+	 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
+	 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
+	 * relies on GuC and GPU for busyness calculations. Due to this, A
+	 * potential race was highlighted in an earlier review that can lead to
+	 * double accounting of busyness. While the solution to this is a wip,
+	 * busyness is still usable for platforms running GuC submission.
+	 */
+	pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
+	last_switch = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
+	engine_id = READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
+
+	guc_update_pm_timestamp(guc, &unused);
+
+	if (engine_id != 0xffffffff && last_switch) {
+		start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
+		__extend_last_switch(guc, &start_gt_clk, last_switch);
+		active = intel_gt_clock_interval_to_ns(gt, guc->timestamp.gt_stamp - start_gt_clk);
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
+		WRITE_ONCE(ce->stats.active, active);

Should not need WRITE_ONCE to update regular memory. Not even sure we need
READ_ONCE above.

Not sure I checked what they do. I was thinking these are needed for the 
memory ordering (as in be sure that start_gt_clk is updated before 
active).


+	} else {
+		lrc_update_runtime(ce);

As was being discussed, should not need this here in this function. See
below too.

In short, I added this here so that a query for busyness following idle 
can be obtained immediately. For GuC backend, the context is unpinned 
after disabling scheduling on that context and that is asynchronous.  
Also if there are more requests on that context, the scheduling may not 
be disabled and unpin may not happen, so updated runtime would only be 
seen much much later.

It is still safe to call from here because we know that the context is 
not active and has switched out. If it did switch in while we were 
reading this, that's still fine, we would only report the value stored 
in the context image.


+	}
+
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
+static void guc_context_update_stats(struct intel_context *ce)
+{
+	if (!intel_context_pin_if_active(ce)) {
+		WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
+		WRITE_ONCE(ce->stats.active, 0);

Why do these need to be initialized to 0? Looks like the calculations in
__guc_context_update_clks() will work even if we don't do this? Also I
didn't follow the 'if (!intel_context_pin_if_active(ce))' check.

__guc_context_update_clks accesses the context image, so we need to make 
sure it's pinned. pin if active will not sleep/wait, so we can use it in 
this path.

if context is not active, then we update the active stats to 0.


+		return;
+	}
+
+	__guc_context_update_clks(ce);
+	intel_context_unpin(ce);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -2723,6 +2780,7 @@ static void guc_context_unpin(struct intel_context *ce)
 {
	struct intel_guc *guc = cce_to_guc(ce);

+	lrc_update_runtime(ce);

How about moving this into lrc_unpin() since that gets called from all guc
context types (parent/child/virtual).

looks like lrc_unpin is called from context_unpin path.

Same as above: for GuC, the context_unpin is an async operation and may 
not happen if there are multiple requests in queue. 

Thanks,
Umesh

	unpin_guc_id(guc, ce);
	lrc_unpin(ce);

@@ -3344,6 +3402,7 @@ static void remove_from_context(struct i915_request *rq)
 }

 static const struct intel_context_ops guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
	.alloc = guc_context_alloc,

	.pre_pin = guc_context_pre_pin,
@@ -3360,6 +3419,8 @@ static const struct intel_context_ops guc_context_ops = {

	.sched_disable = guc_context_sched_disable,

+	.update_stats = guc_context_update_stats,
+
	.reset = lrc_reset,
	.destroy = guc_context_destroy,

@@ -3593,6 +3654,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
 }

 static const struct intel_context_ops virtual_guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
	.alloc = guc_virtual_context_alloc,

	.pre_pin = guc_virtual_context_pre_pin,
@@ -3608,6 +3670,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
	.exit = guc_virtual_context_exit,

	.sched_disable = guc_context_sched_disable,
+	.update_stats = guc_context_update_stats,

	.destroy = guc_context_destroy,

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c
index b09d1d386574..8d81119fff14 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -147,11 +147,7 @@ void i915_drm_client_fdinfo(struct seq_file *m, struct file *f)
		   PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
	seq_printf(m, "drm-client-id:\t%u\n", client->id);

-	/*
-	 * Temporarily skip showing client engine information with GuC submission till
-	 * fetching engine busyness is implemented in the GuC submission backend
-	 */
-	if (GRAPHICS_VER(i915) < 8 || intel_uc_uses_guc_submission(&i915->gt0.uc))
+	if (GRAPHICS_VER(i915) < 8)
		return;

	for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
--
2.37.1