On Thu, 12 Sep 2024 16:03:37 +0100
Adrián Larumbe <adrian.larumbe@xxxxxxxxxxxxx> wrote:

> Hi Boris, thanks for the remarks,
> 
> On 04.09.2024 09:49, Boris Brezillon wrote:
> > On Tue, 3 Sep 2024 21:25:35 +0100
> > Adrián Larumbe <adrian.larumbe@xxxxxxxxxxxxx> wrote:
> > 
> > > Enable calculations of job submission times in clock cycles and wall
> > > time. This is done by expanding the boilerplate command stream when
> > > running a job to include instructions that compute said times right
> > > before and after a user CS.
> > > 
> > > A separate kernel BO is created per queue to store those values. Jobs
> > > can access their sampled data through an index into the slots buffer
> > > that is different from the queue's ringbuffer index. The reason for
> > > this is saving memory on the profiling information kernel BO, since
> > > the number of simultaneously profiled jobs we can write into the
> > > queue's ringbuffer might be much smaller than for regular jobs, as the
> > > former take more CSF instructions.
> > > 
> > > This commit is done in preparation for enabling DRM fdinfo support in
> > > the Panthor driver, which depends on the numbers calculated herein.
> > > 
> > > A profile mode mask has been added that will, in a future commit,
> > > allow UM to toggle performance metric sampling behaviour, which is
> > > disabled by default to save power. When a ringbuffer CS is
> > > constructed, timestamp and cycle sampling instructions are added
> > > depending on the enabled flags in the profiling mask.
> > > 
> > > A helper is provided that calculates the number of instructions for a
> > > given enablement mask, and this is passed as the number of credits
> > > when initialising a DRM scheduler job.
> > > 
> > > Signed-off-by: Adrián Larumbe <adrian.larumbe@xxxxxxxxxxxxx>
> > > ---
> > >  drivers/gpu/drm/panthor/panthor_device.h |  22 ++
> > >  drivers/gpu/drm/panthor/panthor_sched.c  | 327 ++++++++++++++++++++---
> > >  2 files changed, 305 insertions(+), 44 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > > index e388c0472ba7..a48e30d0af30 100644
> > > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > > @@ -66,6 +66,25 @@ struct panthor_irq {
> > >  	atomic_t suspended;
> > >  };
> > >  
> > > +/**
> > > + * enum panthor_device_profiling_mode - Profiling state
> > > + */
> > > +enum panthor_device_profiling_flags {
> > > +	/** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> > > +	PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> > > +
> > > +	/** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> > > +	PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> > > +
> > > +	/** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> > > +	PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> > > +
> > > +	/** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> > > +	PANTHOR_DEVICE_PROFILING_ALL =
> > > +		PANTHOR_DEVICE_PROFILING_CYCLES |
> > > +		PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > +};
> > > +
> > >  /**
> > >   * struct panthor_device - Panthor device
> > >   */
> > > @@ -162,6 +181,9 @@ struct panthor_device {
> > >  		 */
> > >  		struct page *dummy_latest_flush;
> > >  	} pm;
> > > +
> > > +	/** @profile_mask: User-set profiling flags for job accounting. */
> > > +	u32 profile_mask;
> > >  };
> > >  
> > >  /**
> > > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > > index c426a392b081..b087648bf59a 100644
> > > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > > @@ -93,6 +93,9 @@
> > >  #define MIN_CSGS				3
> > >  #define MAX_CSG_PRIO				0xf
> > >  
> > > +#define NUM_INSTRS_PER_CACHE_LINE	(64 / sizeof(u64))
> > > +#define MAX_INSTRS_PER_JOB		32
> > > +
> > >  struct panthor_group;
> > >  
> > >  /**
> > > @@ -476,6 +479,18 @@ struct panthor_queue {
> > >  		 */
> > >  		struct list_head in_flight_jobs;
> > >  	} fence_ctx;
> > > +
> > > +	/** @profiling_info: Job profiling data slots and access information. */
> > > +	struct {
> > > +		/** @slots: Kernel BO holding the slots. */
> > > +		struct panthor_kernel_bo *slots;
> > > +
> > > +		/** @slot_count: Number of jobs ringbuffer can hold at once. */
> > > +		u32 slot_count;
> > > +
> > > +		/** @profiling_seqno: Index of the next available profiling information slot. */
> > > +		u32 profiling_seqno;
> > 
> > Nit: no need to repeat profiling as it's under the profiling_info
> > struct. I would simply name that one 'seqno'.
> > 
> > > +	} profiling_info;
> > 
> > s/profiling_info/profiling/ ?
> > 
> > >  };
> > >  
> > >  /**
> > > @@ -661,6 +676,18 @@ struct panthor_group {
> > >  	struct list_head wait_node;
> > >  };
> > >  
> > > +struct panthor_job_profiling_data {
> > > +	struct {
> > > +		u64 before;
> > > +		u64 after;
> > > +	} cycles;
> > > +
> > > +	struct {
> > > +		u64 before;
> > > +		u64 after;
> > > +	} time;
> > > +};
> > > +
> > >  /**
> > >   * group_queue_work() - Queue a group work
> > >   * @group: Group to queue the work for.
> > > @@ -774,6 +801,12 @@ struct panthor_job {
> > >  
> > >  	/** @done_fence: Fence signaled when the job is finished or cancelled. */
> > >  	struct dma_fence *done_fence;
> > > +
> > > +	/** @profile_mask: Current device job profiling enablement bitmask. */
> > > +	u32 profile_mask;
> > > +
> > > +	/** @profile_slot: Job profiling information index in the profiling slots BO. */
> > > +	u32 profiling_slot;
> > 
> > Nit: we tend to group fields together under sub-structs, so I'd say:
> > 
> > struct {
> > 	u32 mask; // or flags
> > 	u32 slot;
> > } profiling;
> > 
> > >  };
> > >  
> > >  static void
> > > @@ -838,6 +871,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> > >  
> > >  	panthor_kernel_bo_destroy(queue->ringbuf);
> > >  	panthor_kernel_bo_destroy(queue->iface.mem);
> > > +	panthor_kernel_bo_destroy(queue->profiling_info.slots);
> > >  
> > >  	/* Release the last_fence we were holding, if any. */
> > >  	dma_fence_put(queue->fence_ctx.last_fence);
> > > @@ -1982,8 +2016,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> > >  	}
> > >  }
> > >  
> > > -#define NUM_INSTRS_PER_SLOT		16
> > > -
> > >  static void
> > >  group_term_post_processing(struct panthor_group *group)
> > >  {
> > > @@ -2815,65 +2847,211 @@ static void group_sync_upd_work(struct work_struct *work)
> > >  	group_put(group);
> > >  }
> > >  
> > > -static struct dma_fence *
> > > -queue_run_job(struct drm_sched_job *sched_job)
> > > +struct panthor_job_ringbuf_instrs {
> > > +	u64 buffer[MAX_INSTRS_PER_JOB];
> > > +	u32 count;
> > > +};
> > > +
> > > +struct panthor_job_instr {
> > > +	u32 profile_mask;
> > > +	u64 instr;
> > > +};
> > > +
> > > +#define JOB_INSTR(__prof, __instr) \
> > > +	{ \
> > > +		.profile_mask = __prof, \
> > > +		.instr = __instr, \
> > > +	}
> > > +
> > > +static void
> > > +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> > > +		       struct panthor_job *job,
> > > +		       struct panthor_job_ringbuf_instrs *instrs)
> > > +{
> > > +	ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > > +	u32 start = job->ringbuf.start & (ringbuf_size - 1);
> > > +	ssize_t size, written;
> > > +
> > > +	/*
> > > +	 * We need to write a whole slot, including any trailing zeroes
> > > +	 * that may come at the end of it. Also, because instrs.buffer had
> > > +	 * been zero-initialised, there's no need to pad it with 0's
> > > +	 */
> > > +	instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> > > +	size = instrs->count * sizeof(u64);
> > > +	written = min(ringbuf_size - start, size);
> > > +
> > > +	memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> > > +
> > > +	if (written < size)
> > > +		memcpy(queue->ringbuf->kmap,
> > > +		       &instrs->buffer[(ringbuf_size - start)/sizeof(u64)],
> > > +		       size - written);
> > > +}
> > > +
> > > +struct panthor_job_cs_params {
> > > +	u32 profile_mask;
> > > +	u64 addr_reg; u64 val_reg;
> > > +	u64 cycle_reg; u64 time_reg;
> > > +	u64 sync_addr; u64 times_addr;
> > > +	u64 cs_start; u64 cs_size;
> > > +	u32 last_flush; u32 waitall_mask;
> > > +};
> > > +
> > > +static void
> > > +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
> > >  {
> > > -	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> > >  	struct panthor_group *group = job->group;
> > >  	struct panthor_queue *queue = group->queues[job->queue_idx];
> > >  	struct panthor_device *ptdev = group->ptdev;
> > >  	struct panthor_scheduler *sched = ptdev->scheduler;
> > > -	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > > -	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> > > -	u64 addr_reg = ptdev->csif_info.cs_reg_count -
> > > -		       ptdev->csif_info.unpreserved_cs_reg_count;
> > > -	u64 val_reg = addr_reg + 2;
> > > -	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > > -			job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > > -	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> > > -	struct dma_fence *done_fence;
> > > -	int ret;
> > >  
> > > -	u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> > > +	params->addr_reg = ptdev->csif_info.cs_reg_count -
> > > +			   ptdev->csif_info.unpreserved_cs_reg_count;
> > > +	params->val_reg = params->addr_reg + 2;
> > > +	params->cycle_reg = params->addr_reg;
> > > +	params->time_reg = params->val_reg;
> > > +
> > > +	params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > > +			    job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > > +	params->times_addr = panthor_kernel_bo_gpuva(queue->profiling_info.slots) +
> > > +			     (job->profiling_slot * sizeof(struct panthor_job_profiling_data));
> > > +	params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> > > +
> > > +	params->cs_start = job->call_info.start;
> > > +	params->cs_size = job->call_info.size;
> > > +	params->last_flush = job->call_info.latest_flush;
> > > +
> > > +	params->profile_mask = job->profile_mask;
> > > +}
> > > +
> > > +static void
> > > +prepare_job_instrs(const struct panthor_job_cs_params *params,
> > > +		   struct panthor_job_ringbuf_instrs *instrs)
> > > +{
> > > +	const struct panthor_job_instr instr_seq[] = {
> > >  		/* MOV32 rX+2, cs.latest_flush */
> > > -		(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (2ull << 56) | (params->val_reg << 48) | params->last_flush),
> > >  
> > >  		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> > > -		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> > > +
> > > +		/* MOV48 rX:rX+1, cycles_offset */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > +			  (1ull << 56) | (params->cycle_reg << 48) |
> > > +			  (params->times_addr +
> > > +			   offsetof(struct panthor_job_profiling_data, cycles.before))),
> > > +		/* STORE_STATE cycles */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > +			  (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> > > +		/* MOV48 rX:rX+1, time_offset */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > +			  (1ull << 56) | (params->time_reg << 48) |
> > > +			  (params->times_addr +
> > > +			   offsetof(struct panthor_job_profiling_data, time.before))),
> > > +		/* STORE_STATE timer */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > +			  (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> > >  
> > >  		/* MOV48 rX:rX+1, cs.start */
> > > -		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> > >  		/* MOV32 rX+2, cs.size */
> > > -		(2ull << 56) | (val_reg << 48) | job->call_info.size,
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (2ull << 56) | (params->val_reg << 48) | params->cs_size),
> > >  		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> > > -		(3ull << 56) | (1 << 16),
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)),
> > >  		/* CALL rX:rX+1, rX+2 */
> > > -		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> > > +
> > > +		/* MOV48 rX:rX+1, cycles_offset */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > +			  (1ull << 56) | (params->cycle_reg << 48) |
> > > +			  (params->times_addr +
> > > +			   offsetof(struct panthor_job_profiling_data, cycles.after))),
> > > +		/* STORE_STATE cycles */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > +			  (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> > > +
> > > +		/* MOV48 rX:rX+1, time_offset */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > +			  (1ull << 56) | (params->time_reg << 48) |
> > > +			  (params->times_addr +
> > > +			   offsetof(struct panthor_job_profiling_data, time.after))),
> > > +		/* STORE_STATE timer */
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > +			  (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> > >  
> > >  		/* MOV48 rX:rX+1, sync_addr */
> > > -		(1ull << 56) | (addr_reg << 48) | sync_addr,
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
> > >  		/* MOV48 rX+2, #1 */
> > > -		(1ull << 56) | (val_reg << 48) | 1,
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (1ull << 56) | (params->val_reg << 48) | 1),
> > >  		/* WAIT(all) */
> > > -		(3ull << 56) | (waitall_mask << 16),
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (3ull << 56) | (params->waitall_mask << 16)),
> > >  		/* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> > > -		(51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> > > -
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > +			  (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> > > +			  (params->val_reg << 32) | (0 << 16) | 1),
> > >  		/* ERROR_BARRIER, so we can recover from faults at job
> > >  		 * boundaries.
> > >  		 */
> > > -		(47ull << 56),
> > > +		JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)),
> > > +	};
> > > +	u32 pad;
> > > +
> > > +	/* NEED to be cacheline aligned to please the prefetcher. */
> > > +	static_assert(sizeof(instrs->buffer) % 64 == 0,
> > > +		      "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> > > +
> > > +	/* Make sure we have enough storage to store the whole sequence. */
> > > +	static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) <=
> > > +		      ARRAY_SIZE(instrs->buffer),
> > > +		      "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
> > 
> > We probably want to catch situations where instrs->buffer has gone
> > bigger than needed (say we found a way to drop instructions), so I
> > would turn the '<=' condition into an '=='.
> 
> I did this but it's triggering the static assert, because the instr_seq
> array has 19 elements, which when aligned to NUM_INSTRS_PER_CACHE_LINE
> (8, I think) yields 24, still less than the 32 slots in instrs->buffer.
> Having a slightly oversized instrs.buffer array isn't a problem though,
> because instrs.count is used when copying the instructions into the
> ringbuffer, so I think leaving this assert as an <= should be alright.

The whole point is to have a buffer that's the right size, rather than
bigger than needed, so I'd suggest changing the MAX_INSTRS_PER_JOB
definition to make it 24 instead.
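
Something along these lines (untested, and assuming the sequence stays
at 19 instructions):

	#define MAX_INSTRS_PER_JOB	24

together with the '==' version of the assert:

	/* Make sure the buffer is exactly as big as the padded sequence. */
	static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) ==
		      ARRAY_SIZE(instrs->buffer),
		      "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");

With NUM_INSTRS_PER_CACHE_LINE being 64 / sizeof(u64) == 8, ALIGN(19, 8)
== 24 matches the buffer size exactly, and the assert starts firing again
as soon as instructions are added to or dropped from instr_seq.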