Re: [Freedreno] [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling

Sharat Masetty <smasetty@xxxxxxxxxxxxxx> · Fri, 26 Oct 2018 19:16:48 +0530

Added Rob to this thread.

On 10/17/2018 8:05 PM, Jordan Crouse wrote:
On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
This patch attempts to make use of the hardware counters for GPU busy %
estimation when possible and skip using the software counters as it also
accounts for software side delays. This should help give more accurate
representation of the GPU workload.

I would leave this more to Rob as this particular file makes more sense for
freedreno than it does for us but I think in general if you want to do this
then we should do use the hardware for all targets and get rid of the
mix of the old and the new.
Thanks for the comments Jordan. Yes, I prefer the same too, but the only 
limiting factor for me is that I don't have a way to test changes for 
targets such as a3xx and a4xx, and I also do not know if there is anyone 
actually using this currently.

Hi Rob,
Can you please share your comments on this? Is it okay to remove 
software counters functionality?

Sharat

Signed-off-by: Sharat Masetty <smasetty@xxxxxxxxxxxxxx>
---
  drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
  drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
  drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
  3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index e9b5426..a896541 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
  	uint32_t elapsed;
  	unsigned long flags;
  
+	if (gpu->funcs->gpu_busy)
+		return;

Like here - there isn't any reason to not have a gpu_busy for each target
so then this code could mostly go away.

  	spin_lock_irqsave(&gpu->perf_lock, flags);
  	if (!gpu->perfcntr_active)
  		goto out;
@@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
  	/* we could dynamically enable/disable perfcntr registers too.. */
  	gpu->last_sample.active = msm_gpu_active(gpu);
  	gpu->last_sample.time = ktime_get();
+	gpu->last_sample.busy_cycles = 0;
  	gpu->activetime = gpu->totaltime = 0;
  	gpu->perfcntr_active = true;
  	update_hw_cntrs(gpu, 0, NULL);
@@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
  	pm_runtime_put_sync(&gpu->pdev->dev);
  }
  
+static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime)
+{
+	ktime_t time;
+
+	*activetime = gpu->funcs->gpu_busy(gpu,
+			&gpu->last_sample.busy_cycles);
+
+	time = ktime_get();
+	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
+	gpu->last_sample.time = time;
+}

This can all be done inline in the sample function below.

  /* returns -errno or # of cntrs sampled */
-int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
-		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)

This might be an good change (though if our activetime or totaltime ever
goes over 32 bits we've got ourselves a problem) - but it should be in a
separate patch.

  {
  	unsigned long flags;
  	int ret;
@@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
  		goto out;
  	}
  
+	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
+
+	if (gpu->funcs->gpu_busy) {
+		msm_gpu_hw_sample(gpu, activetime, totaltime);
+		goto out;
+	}
+
  	*activetime = gpu->activetime;
  	*totaltime = gpu->totaltime;
  
  	gpu->activetime = gpu->totaltime = 0;
  
-	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
-
  out:
  	spin_unlock_irqrestore(&gpu->perf_lock, flags);
  
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 0ff23ca..7dc775f 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -90,6 +90,7 @@ struct msm_gpu {
  	struct {
  		bool active;
  		ktime_t time;
+		u64 busy_cycles;
  	} last_sample;
  	uint32_t totaltime, activetime;    /* sw counters */
  	uint32_t last_cntrs[5];            /* hw counters */
@@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
  
  void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
  void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
-int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
-		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
  
  void msm_gpu_retire(struct msm_gpu *gpu);
  void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
index 5ab21bd..318f7dd 100644
--- a/drivers/gpu/drm/msm/msm_perf.c
+++ b/drivers/gpu/drm/msm/msm_perf.c
@@ -17,7 +17,7 @@
  
  /* For profiling, userspace can:
   *
- *   tail -f /sys/kernel/debug/dri/<minor>/gpu
+ *   tail -f /sys/kernel/debug/dri/<minor>/perf
   *
   * This will enable performance counters/profiling to track the busy time
   * and any gpu specific performance counters that are supported.
@@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
  		}
  	} else {
  		/* Sample line: */
-		uint32_t activetime = 0, totaltime = 0;
+		uint64_t activetime = 0, totaltime = 0;

All the changes to msm_perf.c shuld also be in a separate patch that moves
the sample size to u64.

  		uint32_t cntrs[5];
-		uint32_t val;
+		uint64_t val;
  		int ret;
  
  		/* sleep until next sample time: */
@@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
  			return ret;
  
  		val = totaltime ? 1000 * activetime / totaltime : 0;
-		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
+		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
  		ptr += n;
  		rem -= n;
  
  		for (i = 0; i < ret; i++) {
  			/* cycle counters (I think).. convert to MHz.. */
  			val = cntrs[i] / 10000;
-			n = snprintf(ptr, rem, "\t%5d.%02d",
+			n = snprintf(ptr, rem, "\t%5llu.%02llu",
  					val / 100, val % 100);
  			ptr += n;
  			rem -= n;
--
1.9.1



--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project