Re: [PATCH 1/2] i915/pmu: Add support for total context runtime for GuC back-end

On Mon, Apr 24, 2023 at 10:41:41AM -0700, Dixit, Ashutosh wrote:
On Tue, 04 Apr 2023 17:14:32 -0700, Umesh Nerlige Ramappa wrote:

Hi Umesh,

The GPU accumulates the context runtime in a 32 bit counter - CTX_TIMESTAMP
in the context image. This value is saved/restored on context switches.
The KMD accumulates these values into a 64 bit counter, taking care of
any overflows as needed. This count provides the basis for client
specific busyness in the fdinfo interface.

KMD accumulation happens just before the context is unpinned and when
the context switches out. This works for the execlist back-end since
execlist scheduling has visibility into context switches. In GuC mode,
the KMD does not have visibility into context switches, so this counter
is accumulated only when the context is unpinned. The context is
unpinned once context scheduling is successfully disabled, and
disabling context scheduling is an asynchronous operation.

This means guc_context_unpin() is called asynchronously, correct? From
guc_context_sched_disable()?

Correct.

Also, if a context is servicing frequent requests, scheduling may never
be disabled on it.

For GuC mode, since updates to the context runtime may be delayed, add
hooks to update the context runtime in a worker thread as well as when
a user queries for it.

Limitation:
- If a context is never switched out or runs for a long period of time,
  the runtime value of CTX_TIMESTAMP may never be updated, so the
  counter value may be unreliable. This patch does not support such
  cases; such support must come from the GuC FW, and it is WIP.

This patch is an extract from previous work authored by John/Umesh here -
https://patchwork.freedesktop.org/patch/496441/?series=105085&rev=4

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@xxxxxxxxx>
Co-developed-by: John Harrison <John.C.Harrison@xxxxxxxxx>
Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx>
---
 drivers/gpu/drm/i915/gt/intel_context.c       | 12 +++++--
 drivers/gpu/drm/i915/gt/intel_context.h       |  2 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  5 +++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 33 +++++++++++++++++++
 4 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 2aa63ec521b8..e01f222e9e42 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -578,16 +578,24 @@ void intel_context_bind_parent_child(struct intel_context *parent,
	child->parallel.parent = parent;
 }

-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
 {
	u64 total, active;

+	if (ce->ops->update_stats)
+		ce->ops->update_stats(ce);
+
	total = ce->stats.runtime.total;
	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
		total *= ce->engine->gt->clock_period_ns;

	active = READ_ONCE(ce->stats.active);
-	if (active)
+	/*
+	 * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops, the backend
+	 * already provides the total active time of the context,

Where is this done in the GuC case? I looked but couldn't find it (at
least not in this version of the patch; it is there in the old version).

In this case, the backend is not providing the total active time. I guess I should drop this logic, provided ce->stats.active is 0 in GuC mode.
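
For reference, with ce->stats.active always 0 in GuC mode, the getter
would effectively reduce to something like this (a sketch, assuming the
COPS_RUNTIME_ACTIVE_TOTAL special case is dropped):

u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
{
	u64 total, active;

	/* let the back-end fold any pending HW counter updates in */
	if (ce->ops->update_stats)
		ce->ops->update_stats(ce);

	total = ce->stats.runtime.total;
	if (ce->ops->flags & COPS_RUNTIME_CYCLES)
		total *= ce->engine->gt->clock_period_ns;

	/* stays 0 in GuC mode, so this only kicks in for execlists */
	active = READ_ONCE(ce->stats.active);
	if (active)
		active = intel_context_clock() - active;

	return total + active;
}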


+	 * so skip this calculation when this flag is set.
+	 */
+	if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
		active = intel_context_clock() - active;

	return total + active;
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 0a8d553da3f4..720809523e2d 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -368,7 +368,7 @@ intel_context_clear_nopreempt(struct intel_context *ce)
	clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }

-u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
 u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);

 static inline u64 intel_context_clock(void)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index e36670f2e626..58b0294d359d 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -38,6 +38,9 @@ struct intel_context_ops {
 #define COPS_RUNTIME_CYCLES_BIT 1
 #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)

+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
+
	int (*alloc)(struct intel_context *ce);

	void (*revoke)(struct intel_context *ce, struct i915_request *rq,
@@ -58,6 +61,8 @@ struct intel_context_ops {

	void (*sched_disable)(struct intel_context *ce);

+	void (*update_stats)(struct intel_context *ce);
+
	void (*reset)(struct intel_context *ce);
	void (*destroy)(struct kref *kref);

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 88e881b100cf..8048a3e97a68 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1402,13 +1402,25 @@ static void __update_guc_busyness_stats(struct intel_guc *guc)
	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
 }

+static void guc_context_update_clks(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	unsigned long flags;
+
+	spin_lock_irqsave(&guc->timestamp.lock, flags);
+	lrc_update_runtime(ce);
+	spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+}
+
 static void guc_timestamp_ping(struct work_struct *wrk)
 {
	struct intel_guc *guc = container_of(wrk, typeof(*guc),
					     timestamp.work.work);
	struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
	struct intel_gt *gt = guc_to_gt(guc);
+	struct intel_context *ce;
	intel_wakeref_t wakeref;
+	unsigned long index;
	int srcu, ret;

	/*
@@ -1424,6 +1436,10 @@ static void guc_timestamp_ping(struct work_struct *wrk)
	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
		__update_guc_busyness_stats(guc);

How do we know the context images are pinned at this point, which is
needed for the code below? Where is the pinning happening?

Ah, maybe this should just call guc_context_update_stats() instead.
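
i.e. something like this in guc_timestamp_ping(), so that only pinned
contexts have their context image read (sketch):

	/* adjust context stats for overflow, skipping unpinned contexts */
	xa_for_each(&guc->context_lookup, index, ce)
		guc_context_update_stats(ce);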

+	/* adjust context stats for overflow */
+	xa_for_each(&guc->context_lookup, index, ce)
+		guc_context_update_clks(ce);

Are we saying here that we need to do this because the context can get
switched out (and its context image saved) and back in multiple times
without the context getting unpinned? Otherwise guc_context_unpin()
would call lrc_update_runtime() and we wouldn't have to do this here.

Mainly for 32 bit overflows. The busyness value obtained from the context image is a 32 bit value and could wrap around. If we keep sampling it periodically and accumulating it into the 64 bit value in the ce stats, we should be good.
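
The arithmetic that makes the periodic sampling work is the usual
mod-2^32 delta trick. Roughly (an illustrative helper, not the exact
lrc_update_runtime() body; the runtime.last field name is an
assumption):

/*
 * Fold a 32 bit HW counter sample into a 64 bit total. The unsigned
 * subtraction yields the correct delta even across a wrap, provided
 * we sample at least once per wrap period.
 */
static void accumulate_runtime(struct intel_context *ce, u32 now)
{
	u32 dt = now - ce->stats.runtime.last;	/* delta mod 2^32 */

	ce->stats.runtime.last = now;
	ce->stats.runtime.total += dt;
}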


+
	intel_gt_reset_unlock(gt, srcu);

	guc_enable_busyness_worker(guc);
@@ -1505,6 +1521,17 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
	guc_enable_busyness_worker(guc);
 }

+static void guc_context_update_stats(struct intel_context *ce)
+{
+	if (!intel_context_pin_if_active(ce)) {
+		WRITE_ONCE(ce->stats.active, 0);

This is related to the question above about where is ce->stats.active being
updated in GuC case. If it is not being updated then we wouldn't have to do
this here, we could just initialize it to 0 once or it might already be
initialized to 0 (if say ce->stats was kzalloc'd).

Agreed, will drop this.
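
With the WRITE_ONCE() dropped, the function would reduce to (sketch):

static void guc_context_update_stats(struct intel_context *ce)
{
	/* only touch the context image while it is pinned */
	if (!intel_context_pin_if_active(ce))
		return;

	guc_context_update_clks(ce);
	intel_context_unpin(ce);
}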


+		return;
+	}
+
+	guc_context_update_clks(ce);
+	intel_context_unpin(ce);
+}
+
 static inline bool
 submission_disabled(struct intel_guc *guc)
 {
@@ -2774,6 +2801,7 @@ static void guc_context_unpin(struct intel_context *ce)
 {
	struct intel_guc *guc = ce_to_guc(ce);

+	lrc_update_runtime(ce);

If we call lrc_update_runtime() from these 3 code paths (as is done in
this patch), we would need to hold guc->timestamp.lock here. Is that
happening (I don't see it), or do we need to call
guc_context_update_clks() here? I am assuming the context image is
pinned here, so pinning is not an issue.

Maybe I will just call guc_context_update_clks() here.
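
i.e. something like this, with the rest of the function unchanged
(sketch):

static void guc_context_unpin(struct intel_context *ce)
{
	struct intel_guc *guc = ce_to_guc(ce);

	/* takes guc->timestamp.lock around lrc_update_runtime() */
	guc_context_update_clks(ce);

	unpin_guc_id(guc, ce);
	lrc_unpin(ce);
}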

thanks,
Umesh


	unpin_guc_id(guc, ce);
	lrc_unpin(ce);

@@ -3455,6 +3483,7 @@ static void remove_from_context(struct i915_request *rq)
 }

 static const struct intel_context_ops guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
	.alloc = guc_context_alloc,

	.close = guc_context_close,
@@ -3473,6 +3502,8 @@ static const struct intel_context_ops guc_context_ops = {

	.sched_disable = guc_context_sched_disable,

+	.update_stats = guc_context_update_stats,
+
	.reset = lrc_reset,
	.destroy = guc_context_destroy,

@@ -3728,6 +3759,7 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
 }

 static const struct intel_context_ops virtual_guc_context_ops = {
+	.flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
	.alloc = guc_virtual_context_alloc,

	.close = guc_context_close,
@@ -3745,6 +3777,7 @@ static const struct intel_context_ops virtual_guc_context_ops = {
	.exit = guc_virtual_context_exit,

	.sched_disable = guc_context_sched_disable,
+	.update_stats = guc_context_update_stats,

	.destroy = guc_context_destroy,

--
2.36.1


Thanks.
--
Ashutosh


