Re: [PATCH 1/3] drm/i915/guc: Support new and improved engine busyness

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10/3/2023 13:58, Umesh Nerlige Ramappa wrote:
On Fri, Sep 22, 2023 at 03:25:08PM -0700, John.C.Harrison@xxxxxxxxx wrote:
From: John Harrison <John.C.Harrison@xxxxxxxxx>

The GuC has been extended to support a much more friendly engine
busyness interface. So partition the old interface into a 'busy_v1'
space and add 'busy_v2' support alongside. And if v2 is available, use
that in preference to v1. Note that v2 provides extra features over
and above v1 which will be exposed via PMU in subsequent patches.

Since we are thinking of using the existing busyness counter to expose the v2 values, we can drop the last sentence from above.


Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx>
---
drivers/gpu/drm/i915/gt/intel_engine_types.h  |   4 +-
.../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   4 +-
drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  82 ++--
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  55 ++-
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   9 +-
drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++++++++++++++----
7 files changed, 427 insertions(+), 131 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index a7e6775980043..40fd8f984d64b 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -323,7 +323,7 @@ struct intel_engine_execlists_stats {
    ktime_t start;
};

-struct intel_engine_guc_stats {
+struct intel_engine_guc_stats_v1 {
    /**
     * @running: Active state of the engine when busyness was last sampled.
     */
@@ -603,7 +603,7 @@ struct intel_engine_cs {
    struct {
        union {
            struct intel_engine_execlists_stats execlists;
-            struct intel_engine_guc_stats guc;
+            struct intel_engine_guc_stats_v1 guc_v1;
        };

Overall, I would suggest having the renames as a separate patch. Would make the review easier.


        /**
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index f359bef046e0b..c190a99a36c38 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -137,7 +137,9 @@ enum intel_guc_action {
    INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
    INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
    INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
-    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
+    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A,
+    INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C,
+    INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D,
    INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002,
    INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003,
    INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 6c392bad29c19..e6502ab5f049f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -226,45 +226,61 @@ struct intel_guc {
    struct mutex send_mutex;

    /**
-     * @timestamp: GT timestamp object that stores a copy of the timestamp
-     * and adjusts it for overflow using a worker.
+     * @busy: Data used by the different versions of engine busyness implementations.
     */
-    struct {
-        /**
-         * @lock: Lock protecting the below fields and the engine stats.
-         */
-        spinlock_t lock;
-
-        /**
-         * @gt_stamp: 64 bit extended value of the GT timestamp.
-         */
-        u64 gt_stamp;
-
-        /**
-         * @ping_delay: Period for polling the GT timestamp for
-         * overflow.
-         */
-        unsigned long ping_delay;
-
-        /**
-         * @work: Periodic work to adjust GT timestamp, engine and
-         * context usage for overflows.
-         */
-        struct delayed_work work;
-
+    union {
        /**
-         * @shift: Right shift value for the gpm timestamp
+         * @v1: Data used by v1 engine busyness implementation. Mostly a copy +         * of the GT timestamp extended to 64 bits and the worker for maintaining it.
         */
-        u32 shift;
+        struct {
+            /**
+             * @lock: Lock protecting the below fields and the engine stats.
+             */
+            spinlock_t lock;
+
+            /**
+             * @gt_stamp: 64 bit extended value of the GT timestamp.
+             */
+            u64 gt_stamp;
+
+            /**
+             * @ping_delay: Period for polling the GT timestamp for
+             * overflow.
+             */
+            unsigned long ping_delay;
+
+            /**
+             * @work: Periodic work to adjust GT timestamp, engine and
+             * context usage for overflows.
+             */
+            struct delayed_work work;
+
+            /**
+             * @shift: Right shift value for the gpm timestamp
+             */
+            u32 shift;
+
+            /**
+             * @last_stat_jiffies: jiffies at last actual stats collection time
+             * We use this timestamp to ensure we don't oversample the
+             * stats because runtime power management events can trigger
+             * stats collection at much higher rates than required.
+             */
+            unsigned long last_stat_jiffies;
+        } v1;

        /**
-         * @last_stat_jiffies: jiffies at last actual stats collection time
-         * We use this timestamp to ensure we don't oversample the
-         * stats because runtime power management events can trigger
-         * stats collection at much higher rates than required.
+         * @v2: Data used by v2 engine busyness implementation - a memory object
+         * that is filled in by the GuC and read by the driver.
         */
-        unsigned long last_stat_jiffies;
-    } timestamp;
+        struct {
+            /** @device_vma: object allocated to hold the device level busyness data */
+            struct i915_vma *device_vma;
+            /** @device_map: access object for @device_vma */
+            struct iosys_map device_map;
+        } v2;
+    } busy;

    /**
     * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset. diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 63724e17829a7..1ce595d6816f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -59,7 +59,10 @@ struct __guc_ads_blob {
    struct guc_ads ads;
    struct guc_policies policies;
    struct guc_gt_system_info system_info;
-    struct guc_engine_usage engine_usage;
+    union {
+        struct guc_engine_usage v1;
+        struct guc_function_observation_data v2;
+    } engine_usage;
    /* From here on, location is dynamic! Refer to above diagram. */
    struct guc_mmio_reg regset[];
} __packed;
@@ -948,18 +951,62 @@ void intel_guc_ads_reset(struct intel_guc *guc)
    guc_ads_private_data_reset(guc);
}

-u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
+u32 intel_guc_engine_usage_offset_pf(struct intel_guc *guc)
{
    return intel_guc_ggtt_offset(guc, guc->ads_vma) +
        offsetof(struct __guc_ads_blob, engine_usage);
}

-struct iosys_map intel_guc_engine_usage_record_map(struct intel_engine_cs *engine) +struct iosys_map intel_guc_engine_usage_record_map_v1(struct intel_engine_cs *engine)
{
    struct intel_guc *guc = &engine->gt->uc.guc;
    u8 guc_class = engine_class_to_guc_class(engine->class);
    size_t offset = offsetof(struct __guc_ads_blob,
- engine_usage.engines[guc_class][ilog2(engine->logical_mask)]);
+ engine_usage.v1.engines[guc_class][ilog2(engine->logical_mask)]);

    return IOSYS_MAP_INIT_OFFSET(&guc->ads_map, offset);
}
+
+int intel_guc_engine_usage_record_map_v2(struct intel_guc *guc,
+                     struct intel_engine_cs *engine,
+                     u32 guc_vf,
+                     struct iosys_map *engine_map,
+                     struct iosys_map *global_map)
+{
+    size_t offset_global, offset_engine;
+    struct iosys_map *map;
+    u32 instance;
+    u8 guc_class;
+
+    if (engine) {

engine is not being passed as NULL in this patch, so we can drop the checks in this function.

+        guc_class = engine_class_to_guc_class(engine->class);
+        instance = ilog2(engine->logical_mask);
+    }
+
+    if (guc_vf >= GUC_MAX_VF_COUNT) {

Is it possible to split the code in if/else blocks into seperate functions and do away with using guc_vf == ~0U to switch between the 2 logics.
This was the clearest way of organising it that didn't have checkpatch complaining about annoying things.


+        if (guc_vf != ~0U) {
+            guc_err(guc, "Out of range VF in busyness query: 0x%X\n", guc_vf);
+            return -EINVAL;
+        }
+
+        map = &guc->busy.v2.device_map;
+        offset_global = 0;
+
+        if (engine)
+            offset_engine = offsetof(struct guc_engine_observation_data,
+                         engine_data[guc_class][instance]);
+    } else {
+        map = &guc->ads_map;
+        offset_global = offsetof(struct __guc_ads_blob,
+                     engine_usage.v2.function_data[guc_vf]);
+        if (engine)
+            offset_engine = offsetof(struct __guc_ads_blob,
+ engine_usage.v2.function_data[guc_vf].engine_data[guc_class][instance]);

Recommending splitting the vf id based counter support to a future patch.

+    }
+
+    *global_map = IOSYS_MAP_INIT_OFFSET(map, offset_global);
+    if (engine)
+        *engine_map = IOSYS_MAP_INIT_OFFSET(map, offset_engine);
+
+    return 0;
+}

<snip>

+static void __busy_v2_get_engine_usage_record(struct intel_guc *guc,
+                          struct intel_engine_cs *engine,
+                          u64 *_ticks_engine, u64 *_ticks_function,
+                          u64 *_ticks_gt)
+{
+    struct iosys_map rec_map_engine, rec_map_global;
+    u64 ticks_engine, ticks_function, ticks_gt;
+    int i = 0, ret;
+
+    ret = intel_guc_engine_usage_record_map_v2(guc, engine, ~0U,
+                           &rec_map_engine, &rec_map_global);
+    if (ret) {
+        ticks_engine = 0;
+        ticks_function = 0;
+        ticks_gt = 0;
+        goto done;
+    }
+
+#define record_read_engine(map_, field_) \
+    iosys_map_rd_field(map_, 0, struct guc_engine_data, field_)
+#define record_read_global(map_, field_) \
+    iosys_map_rd_field(map_, 0, struct guc_engine_observation_data, field_)
+
+    do {
+        if (engine)
+            ticks_engine = record_read_engine(&rec_map_engine, total_execution_ticks); +        ticks_function = record_read_global(&rec_map_global, total_active_ticks);
+        ticks_gt = record_read_global(&rec_map_global, gt_timestamp);
+
+        if (engine && (record_read_engine(&rec_map_engine, total_execution_ticks) !=
+                   ticks_engine))
+            continue;
+

engine record and global record could use separate functions, maybe like
__busy_v2_get_engine_usage_record
__busy_v2_get_global_usage_record
I originally had them split. But there is so much common code that it was much simpler code-wise to have a single function.

John.


Regards,
Umesh


+        if (record_read_global(&rec_map_global, total_active_ticks) == ticks_function && +            record_read_global(&rec_map_global, gt_timestamp) == ticks_gt)
+            break;
+    } while (++i < 6);
+
+#undef record_read_engine
+#undef record_read_global
+
+done:
+    if (_ticks_engine)
+        *_ticks_engine = ticks_engine;
+    if (_ticks_function)
+        *_ticks_function = ticks_function;
+    if (_ticks_gt)
+        *_ticks_gt = ticks_gt;
+}
+

<snip>




[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux