Hi Adrián, On Tue, Apr 23, 2024 at 10:32:34PM +0100, Adrián Larumbe wrote: > Enable calculations of job submission times in clock cycles and wall > time. This is done by expanding the boilerplate command stream when running > a job to include instructions that compute said times right before and after > a user CS. > > Those numbers are stored in the queue's group's sync objects BO, right > after them. Because the queues in a group might have a different number of > slots, one must keep track of the overall slot tally when reckoning the > offset of a queue's time sample structs, one for each slot. > > NUM_INSTRS_PER_SLOT had to be increased to 32 because of adding new FW > instructions for storing and subtracting the cycle counter and timestamp > register, and it must always remain a power of two. > > This commit is done in preparation for enabling DRM fdinfo support in the > Panthor driver, which depends on the numbers calculated herein. > > Signed-off-by: Adrián Larumbe <adrian.larumbe@xxxxxxxxxxxxx> > --- > drivers/gpu/drm/panthor/panthor_sched.c | 158 ++++++++++++++++++++---- > 1 file changed, 134 insertions(+), 24 deletions(-) > > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c > index b3a51a6de523..320dfa0388ba 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -93,6 +93,9 @@ > #define MIN_CSGS 3 > #define MAX_CSG_PRIO 0xf > > +#define NUM_INSTRS_PER_SLOT 32 > +#define SLOTSIZE (NUM_INSTRS_PER_SLOT * sizeof(u64)) > + > struct panthor_group; > > /** > @@ -466,6 +469,9 @@ struct panthor_queue { > */ > struct list_head in_flight_jobs; > } fence_ctx; > + > + /** @time_offset: Offset of panthor_job_times structs in group's syncobj bo. */ > + unsigned long time_offset; Minor nitpick: I would call it job_times_offset. Or stats_offset. A one-letter difference from syncobjs' times_offset can sometimes be hard to read. 
> }; > > /** > @@ -580,7 +586,17 @@ struct panthor_group { > * One sync object per queue. The position of the sync object is > * determined by the queue index. > */ > - struct panthor_kernel_bo *syncobjs; > + > + struct { > + /** @bo: Kernel BO holding the sync objects. */ > + struct panthor_kernel_bo *bo; > + > + /** > + * @times_offset: Beginning of panthor_job_times struct samples after > + * the group's array of sync objects. > + */ > + size_t times_offset; > + } syncobjs; > > /** @state: Group state. */ > enum panthor_group_state state; > @@ -639,6 +655,18 @@ struct panthor_group { > struct list_head wait_node; > }; > > +struct panthor_job_times { > + struct { > + u64 before; > + u64 after; > + } cycles; > + > + struct { > + u64 before; > + u64 after; > + } time; > +}; > + > /** > * group_queue_work() - Queue a group work > * @group: Group to queue the work for. > @@ -718,6 +746,9 @@ struct panthor_job { > /** @queue_idx: Index of the queue inside @group. */ > u32 queue_idx; > > + /** @ringbuf_idx: Index of the ringbuffer inside @queue. */ > + u32 ringbuf_idx; > + > /** @call_info: Information about the userspace command stream call. */ > struct { > /** @start: GPU address of the userspace command stream. 
*/ > @@ -833,7 +864,7 @@ static void group_release_work(struct work_struct *work) > > panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->suspend_buf); > panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->protm_suspend_buf); > - panthor_kernel_bo_destroy(group->vm, group->syncobjs); > + panthor_kernel_bo_destroy(group->vm, group->syncobjs.bo); > > panthor_vm_put(group->vm); > kfree(group); > @@ -1924,8 +1955,6 @@ tick_ctx_init(struct panthor_scheduler *sched, > } > } > > -#define NUM_INSTRS_PER_SLOT 16 > - > static void > group_term_post_processing(struct panthor_group *group) > { > @@ -1962,7 +1991,7 @@ group_term_post_processing(struct panthor_group *group) > spin_unlock(&queue->fence_ctx.lock); > > /* Manually update the syncobj seqno to unblock waiters. */ > - syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj)); > + syncobj = group->syncobjs.bo->kmap + (i * sizeof(*syncobj)); > syncobj->status = ~0; > syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno); > sched_queue_work(group->ptdev->scheduler, sync_upd); > @@ -2729,7 +2758,7 @@ static void group_sync_upd_work(struct work_struct *work) > if (!queue) > continue; > > - syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj)); > + syncobj = group->syncobjs.bo->kmap + (queue_idx * sizeof(*syncobj)); > > spin_lock(&queue->fence_ctx.lock); > list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) { > @@ -2764,15 +2793,23 @@ queue_run_job(struct drm_sched_job *sched_job) > struct panthor_scheduler *sched = ptdev->scheduler; > u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); > u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); > + u32 ringbuf_index = ringbuf_insert / (SLOTSIZE); > u64 addr_reg = ptdev->csif_info.cs_reg_count - > ptdev->csif_info.unpreserved_cs_reg_count; > u64 val_reg = addr_reg + 2; > - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + > - job->queue_idx * sizeof(struct panthor_syncobj_64b); > + u64 cycle_reg = 
addr_reg; > + u64 time_reg = val_reg; > + u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) + > + job->queue_idx * sizeof(struct panthor_syncobj_64b); > + u64 times_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) + queue->time_offset + > + (ringbuf_index * sizeof(struct panthor_job_times)); > + > u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); > struct dma_fence *done_fence; > int ret; > > + drm_WARN_ON(&ptdev->base, ringbuf_insert >= ringbuf_size); I think we should return an error here, rather than warn. What would be the point of continuing? > + > u64 call_instrs[NUM_INSTRS_PER_SLOT] = { > /* MOV32 rX+2, cs.latest_flush */ > (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, > > /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ > (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233, > > + /* MOV48 rX:rX+1, cycles_offset */ > + (1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.before)), > + > + /* MOV48 rX:rX+1, time_offset */ > + (1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.before)), > + > + /* STORE_STATE cycles */ > + (40ull << 56) | (cycle_reg << 40) | (1ll << 32), > + > + /* STORE_STATE timer */ > + (40ull << 56) | (time_reg << 40) | (0ll << 32), > + > /* MOV48 rX:rX+1, cs.start */ > (1ull << 56) | (addr_reg << 48) | job->call_info.start, > > /* MOV32 rX+2, cs.size */ > (2ull << 56) | (val_reg << 48) | job->call_info.size, > > /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ > (3ull << 56) | (1 << 16), > > /* CALL rX:rX+1, rX+2 */ > (32ull << 56) | (addr_reg << 40) | (val_reg << 32), > > + /* MOV48 rX:rX+1, cycles_offset */ > + (1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.after)), > + > + /* MOV48 rX:rX+1, time_offset */ > + (1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.after)), > + > + /* STORE_STATE cycles */ > + (40ull << 56) | (cycle_reg << 40) | (1ll << 32), 
> + > + /* STORE_STATE timer */ > + (40ull << 56) | (time_reg << 40) | (0ll << 32), > + > /* MOV48 rX:rX+1, sync_addr */ > (1ull << 56) | (addr_reg << 48) | sync_addr, > > @@ -2846,6 +2907,7 @@ queue_run_job(struct drm_sched_job *sched_job) > > job->ringbuf.start = queue->iface.input->insert; > job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); > + job->ringbuf_idx = ringbuf_index; > > /* Make sure the ring buffer is updated before the INSERT > * register. > @@ -2936,7 +2998,8 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = { > > static struct panthor_queue * > group_create_queue(struct panthor_group *group, > - const struct drm_panthor_queue_create *args) > + const struct drm_panthor_queue_create *args, > + unsigned int slots_so_far) > { > struct drm_gpu_scheduler *drm_sched; > struct panthor_queue *queue; > @@ -2987,9 +3050,12 @@ group_create_queue(struct panthor_group *group, > goto err_free_queue; > } > > + queue->time_offset = group->syncobjs.times_offset + > + (slots_so_far * sizeof(struct panthor_job_times)); > + > ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, > group->ptdev->scheduler->wq, 1, > - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), > + args->ringbuf_size / SLOTSIZE, > 0, msecs_to_jiffies(JOB_TIMEOUT_MS), > group->ptdev->reset.wq, > NULL, "panthor-queue", group->ptdev->base.dev); > @@ -3017,7 +3083,9 @@ int panthor_group_create(struct panthor_file *pfile, > struct panthor_scheduler *sched = ptdev->scheduler; > struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0); > struct panthor_group *group = NULL; > + unsigned int total_slots; > u32 gid, i, suspend_size; > + size_t syncobj_bo_size; > int ret; > > if (group_args->pad) > @@ -3083,33 +3151,75 @@ int panthor_group_create(struct panthor_file *pfile, > goto err_put_group; > } > > - group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm, > - group_args->queues.count * > - sizeof(struct panthor_syncobj_64b), > - 
DRM_PANTHOR_BO_NO_MMAP, > - DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | > - DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, > - PANTHOR_VM_KERNEL_AUTO_VA); > - if (IS_ERR(group->syncobjs)) { > - ret = PTR_ERR(group->syncobjs); > + /* > + * Need to add size for the panthor_job_times structs, as many as the sum > + * of the number of job slots for every single queue ringbuffer. > + */ > + for (i = 0, total_slots = 0; i < group_args->queues.count; i++) > + total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE)); > + > + syncobj_bo_size = (group_args->queues.count * sizeof(struct panthor_syncobj_64b)) > + + (total_slots * sizeof(struct panthor_job_times)); > + > + /* > + * Memory layout of group's syncobjs BO > + * group->syncobjs.bo { > + * struct panthor_syncobj_64b sync1; > + * struct panthor_syncobj_64b sync2; > + * ... > + * As many as group_args->queues.count > + * ... > + * struct panthor_syncobj_64b syncn; > + * struct panthor_job_times queue1_slot1 > + * struct panthor_job_times queue1_slot2 > + * ... > + * As many as queue[i].ringbuf_size / SLOTSIZE > + * ... > + * struct panthor_job_times queue1_slotP > + * ... > + * As many as group_args->queues.count > + * ... > + * struct panthor_job_times queueN_slot1 > + * struct panthor_job_times queueN_slot2 > + * ... 
> + * As many as queue[n].ringbuf_size / SLOTSIZE > + * struct panthor_job_times queueN_slotQ > + * > + * Linearly, group->syncobjs.bo = {syncobj1,..,syncobjN, > + * {queue1 = {js1,..,jsP},..,queueN = {js1,..,jsQ}}} > + * } > + * > + */ > + > + group->syncobjs.bo = panthor_kernel_bo_create(ptdev, group->vm, > + syncobj_bo_size, > + DRM_PANTHOR_BO_NO_MMAP, > + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | > + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, > + PANTHOR_VM_KERNEL_AUTO_VA); > + if (IS_ERR(group->syncobjs.bo)) { > + ret = PTR_ERR(group->syncobjs.bo); > goto err_put_group; > } > > - ret = panthor_kernel_bo_vmap(group->syncobjs); > + ret = panthor_kernel_bo_vmap(group->syncobjs.bo); > if (ret) > goto err_put_group; > > - memset(group->syncobjs->kmap, 0, > - group_args->queues.count * sizeof(struct panthor_syncobj_64b)); > + memset(group->syncobjs.bo->kmap, 0, syncobj_bo_size); > + > + group->syncobjs.times_offset = > + group_args->queues.count * sizeof(struct panthor_syncobj_64b); > > - for (i = 0; i < group_args->queues.count; i++) { > - group->queues[i] = group_create_queue(group, &queue_args[i]); > + for (i = 0, total_slots = 0; i < group_args->queues.count; i++) { > + group->queues[i] = group_create_queue(group, &queue_args[i], total_slots); > if (IS_ERR(group->queues[i])) { > ret = PTR_ERR(group->queues[i]); > group->queues[i] = NULL; > goto err_put_group; > } > > + total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE)); > group->queue_count++; > } > > -- > 2.44.0 > With those comments addressed, Reviewed-by: Liviu Dudau <liviu.dudau@xxxxxxx> Best regards, Liviu -- ==================== | I would like to | | fix the world, | | but they're not | | giving me the | \ source code! / --------------- ¯\_(ツ)_/¯