Re: [Freedreno] [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling

Rob Clark <robdclark@xxxxxxxxx> · Wed, 28 Nov 2018 15:09:56 -0500

On Fri, Oct 26, 2018 at 9:46 AM Sharat Masetty <smasetty@xxxxxxxxxxxxxx> wrote:
>
> Added Rob to this thread.
>
> On 10/17/2018 8:05 PM, Jordan Crouse wrote:
> > On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
> >> This patch attempts to make use of the hardware counters for GPU busy %
> >> estimation when possible and skip using the software counters as it also
> >> accounts for software side delays. This should help give more accurate
> >> representation of the GPU workload.
> >
> > I would leave this more to Rob as this particular file makes more sense for
> > freedreno than it does for us but I think in general if you want to do this
> > then we should do use the hardware for all targets and get rid of the
> > mix of the old and the new.
> Thanks for the comments Jordan. Yes, I prefer the same too, but the only
> limiting factor for me is that I don't have a way to test changes for
> targets such as a3xx and a4xx, and I also do not know if there is anyone
> actually using this currently.
>
> Hi Rob,
> Can you please share your comments on this? Is it okay to remove
> software counters functionality?

In principle yes..  although one side-comment is that there are
patches floating around for a2xx, which is from long enough ago that I
don't remember what the perf-ctr situation is there.

I guess if we can be reasonably confident to implement it w/ hw
counters for all the generations, then I don't see a big need to keep
the sw counter functionality.

I should, in theory, be able to test a3xx/a4xx/a5xx.. a4xx might be a
bit harder to get a recent upstream kernel running on

BR,
-R

>
> Sharat
> >
> >> Signed-off-by: Sharat Masetty <smasetty@xxxxxxxxxxxxxx>
> >> ---
> >>   drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
> >>   drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
> >>   drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
> >>   3 files changed, 34 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> >> index e9b5426..a896541 100644
> >> --- a/drivers/gpu/drm/msm/msm_gpu.c
> >> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> >> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
> >>      uint32_t elapsed;
> >>      unsigned long flags;
> >>
> >> +    if (gpu->funcs->gpu_busy)
> >> +            return;
> >
> > Like here - there isn't any reason to not have a gpu_busy for each target
> > so then this code could mostly go away.
> >
> >>      spin_lock_irqsave(&gpu->perf_lock, flags);
> >>      if (!gpu->perfcntr_active)
> >>              goto out;
> >> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
> >>      /* we could dynamically enable/disable perfcntr registers too.. */
> >>      gpu->last_sample.active = msm_gpu_active(gpu);
> >>      gpu->last_sample.time = ktime_get();
> >> +    gpu->last_sample.busy_cycles = 0;
> >>      gpu->activetime = gpu->totaltime = 0;
> >>      gpu->perfcntr_active = true;
> >>      update_hw_cntrs(gpu, 0, NULL);
> >> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
> >>      pm_runtime_put_sync(&gpu->pdev->dev);
> >>   }
> >>
> >> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime)
> >> +{
> >> +    ktime_t time;
> >> +
> >> +    *activetime = gpu->funcs->gpu_busy(gpu,
> >> +                    &gpu->last_sample.busy_cycles);
> >> +
> >> +    time = ktime_get();
> >> +    *totaltime = ktime_us_delta(time, gpu->last_sample.time);
> >> +    gpu->last_sample.time = time;
> >> +}
> >
> > This can all be done inline in the sample function below.
> >
> >>   /* returns -errno or # of cntrs sampled */
> >> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >> -            uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> >> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> >
> > This might be an good change (though if our activetime or totaltime ever
> > goes over 32 bits we've got ourselves a problem) - but it should be in a
> > separate patch.
> >
> >>   {
> >>      unsigned long flags;
> >>      int ret;
> >> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >>              goto out;
> >>      }
> >>
> >> +    ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> >> +
> >> +    if (gpu->funcs->gpu_busy) {
> >> +            msm_gpu_hw_sample(gpu, activetime, totaltime);
> >> +            goto out;
> >> +    }
> >> +
> >>      *activetime = gpu->activetime;
> >>      *totaltime = gpu->totaltime;
> >>
> >>      gpu->activetime = gpu->totaltime = 0;
> >>
> >> -    ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> >> -
> >>   out:
> >>      spin_unlock_irqrestore(&gpu->perf_lock, flags);
> >>
> >> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> >> index 0ff23ca..7dc775f 100644
> >> --- a/drivers/gpu/drm/msm/msm_gpu.h
> >> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> >> @@ -90,6 +90,7 @@ struct msm_gpu {
> >>      struct {
> >>              bool active;
> >>              ktime_t time;
> >> +            u64 busy_cycles;
> >>      } last_sample;
> >>      uint32_t totaltime, activetime;    /* sw counters */
> >>      uint32_t last_cntrs[5];            /* hw counters */
> >> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
> >>
> >>   void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
> >>   void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
> >> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >> -            uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> >> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> >>
> >>   void msm_gpu_retire(struct msm_gpu *gpu);
> >>   void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
> >> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
> >> index 5ab21bd..318f7dd 100644
> >> --- a/drivers/gpu/drm/msm/msm_perf.c
> >> +++ b/drivers/gpu/drm/msm/msm_perf.c
> >> @@ -17,7 +17,7 @@
> >>
> >>   /* For profiling, userspace can:
> >>    *
> >> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
> >> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
> >>    *
> >>    * This will enable performance counters/profiling to track the busy time
> >>    * and any gpu specific performance counters that are supported.
> >> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
> >>              }
> >>      } else {
> >>              /* Sample line: */
> >> -            uint32_t activetime = 0, totaltime = 0;
> >> +            uint64_t activetime = 0, totaltime = 0;
> >
> > All the changes to msm_perf.c shuld also be in a separate patch that moves
> > the sample size to u64.
> >
> >>              uint32_t cntrs[5];
> >> -            uint32_t val;
> >> +            uint64_t val;
> >>              int ret;
> >>
> >>              /* sleep until next sample time: */
> >> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
> >>                      return ret;
> >>
> >>              val = totaltime ? 1000 * activetime / totaltime : 0;
> >> -            n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
> >> +            n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
> >>              ptr += n;
> >>              rem -= n;
> >>
> >>              for (i = 0; i < ret; i++) {
> >>                      /* cycle counters (I think).. convert to MHz.. */
> >>                      val = cntrs[i] / 10000;
> >> -                    n = snprintf(ptr, rem, "\t%5d.%02d",
> >> +                    n = snprintf(ptr, rem, "\t%5llu.%02llu",
> >>                                      val / 100, val % 100);
> >>                      ptr += n;
> >>                      rem -= n;
> >> --
> >> 1.9.1
> >>
> >
>
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> Linux Foundation Collaborative Project