Re: [PATCH v7 5/7] drm/sched: trace dependencies for gpu jobs

On 31/01/2025 11:03, Pierre-Eric Pelloux-Prayer wrote:
Trace the fence dependencies similarly to how we print fences:

  ... , dependencies:{fence=606:38006}

This allows tools to analyze the dependencies between jobs (previously
this was only possible for fences traced by drm_sched_job_wait_dep).

Since drm_sched_job and drm_run_job use the same base event class,
the caller of trace_drm_run_job has to pass a dep_count of 0 (which
is ignored because dependencies are only considered at submit time).
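
With made-up values, a full trace event line then looks like:

  dev=0000:01:00.0, id=4648, fence=606:38007, ring=gfx_0.0.0, job count:1, hw job count:0, dependencies:{fence=606:38006}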

Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@xxxxxxx>
---
  .../gpu/drm/scheduler/gpu_scheduler_trace.h   | 59 ++++++++++++++++---
  drivers/gpu/drm/scheduler/sched_entity.c      |  8 ++-
  drivers/gpu/drm/scheduler/sched_main.c        |  2 +-
  3 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
index 3cdd8d8f8021..ca19cd9a146a 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
@@ -26,15 +26,41 @@
  #include <linux/stringify.h>
  #include <linux/types.h>
+#include <linux/trace_seq.h>
  #include <linux/tracepoint.h>

  #undef TRACE_SYSTEM
  #define TRACE_SYSTEM gpu_scheduler
  #define TRACE_INCLUDE_FILE gpu_scheduler_trace

+#ifndef __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN
+#define __TRACE_EVENT_GPU_SCHEDULER_PRINT_FN
+/* Similar to trace_print_array_seq but for fences. */
+static inline const char *__print_dma_fence_array(struct trace_seq *p, const void *buf, int count)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 *fences = (u64 *) buf;
+	const char *prefix = "";
+
+	trace_seq_putc(p, '{');
+	for (int i = 0; i < count; i++) {
+		u64 context = fences[2 * i], seqno = fences[2 * i + 1];
+
+		trace_seq_printf(p, "%sfence=%llu:%llu",
+				 prefix, context, seqno);
+		prefix = ",";
+	}
+	trace_seq_putc(p, '}');
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+#endif
+

  DECLARE_EVENT_CLASS(drm_sched_job,
-	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity),
-	    TP_ARGS(sched_job, entity),
+	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity,
+		     unsigned int dep_count),
+	    TP_ARGS(sched_job, entity, dep_count),
  	    TP_STRUCT__entry(
  			     __field(uint64_t, id)
  			     __string(name, sched_job->sched->name)
@@ -43,9 +69,14 @@ DECLARE_EVENT_CLASS(drm_sched_job,
  			     __string(dev, dev_name(sched_job->sched->dev))
  			     __field(uint64_t, fence_context)
  			     __field(uint64_t, fence_seqno)
+			     __field(int, n_deps)
+			     __dynamic_array(u64, deps, dep_count * 2)
  			     ),

  	    TP_fast_assign(
+			   unsigned long idx;
+			   struct dma_fence *fence;
+			   u64 *dyn_arr;
  			   __entry->id = sched_job->id;
  			   __assign_str(name);
  			   __entry->job_count = spsc_queue_count(&entity->job_queue);
@@ -54,22 +85,32 @@ DECLARE_EVENT_CLASS(drm_sched_job,
  			   __assign_str(dev);
  			   __entry->fence_context = sched_job->s_fence->finished.context;
  			   __entry->fence_seqno = sched_job->s_fence->finished.seqno;
-
+			   __entry->n_deps = dep_count;
+			   if (dep_count) {
+				dyn_arr = __get_dynamic_array(deps);
+				xa_for_each(&sched_job->dependencies, idx, fence) {
+					dyn_arr[2 * idx] = fence->context;
+					dyn_arr[2 * idx + 1] = fence->seqno;
+				}
+			   }
  			   ),
-	    TP_printk("dev=%s, id=%llu, fence=%llu:%llu, ring=%s, job count:%u, hw job count:%d",
+	    TP_printk("dev=%s, id=%llu, fence=%llu:%llu, ring=%s, job count:%u, hw job count:%d, dependencies:%s",
  		      __get_str(dev), __entry->id,
  		      __entry->fence_context, __entry->fence_seqno, __get_str(name),
-		      __entry->job_count, __entry->hw_job_count)
+		      __entry->job_count, __entry->hw_job_count,
+		      __print_dma_fence_array(p, __get_dynamic_array(deps), __entry->n_deps))
  );

  DEFINE_EVENT(drm_sched_job, drm_sched_job,
-	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity),
-	    TP_ARGS(sched_job, entity)
+	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity,
+		     unsigned int dep_count),
+	    TP_ARGS(sched_job, entity, dep_count)
  );

  DEFINE_EVENT(drm_sched_job, drm_run_job,
-	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity),
-	    TP_ARGS(sched_job, entity)
+	    TP_PROTO(struct drm_sched_job *sched_job, struct drm_sched_entity *entity,
+	    	     unsigned int dep_count),
+	    TP_ARGS(sched_job, entity, 0)
  );

  TRACE_EVENT(drm_sched_process_job,
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 69bcf0e99d57..0ce3a82fe6fd 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -592,7 +592,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
  	bool first;
  	ktime_t submit_ts;

-	trace_drm_sched_job(sched_job, entity);
+	if (trace_drm_sched_job_enabled()) {
+		unsigned long index, n = 0;
+		void *f;
+
+		xa_for_each(&sched_job->dependencies, index, f) { n++; }
+		trace_drm_sched_job(sched_job, entity, n);
+	}

I wonder if it would be good enough to simply trace dependencies one by one in drm_sched_job_add_dependency and drop all the array magic/support code?

In which case it would be the drm_sched_job_dependency tracepoint I hinted at in my reply to 4/7.

I know you are at v7, but since we are heading down the stable ABI route I think it is worth discussing this option.

We could even keep the separate "wait dep" vs "add dep" distinction and have a set like:

drm_sched_job_add_dependency
drm_sched_job_queue
drm_sched_job_wait_dependency
drm_sched_job_run
drm_sched_job_done

By tracing them one by one we remove the complications on the kernel side and let userspace parse multiple lines, as sketched below. That shouldn't be a problem there.
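
For illustration, a minimal sketch of what such a per-dependency event
could look like (the event name and fields here are illustrative
assumptions, not a concrete proposal):

TRACE_EVENT(drm_sched_job_add_dependency,
	    TP_PROTO(struct drm_sched_job *sched_job, struct dma_fence *fence),
	    TP_ARGS(sched_job, fence),
	    TP_STRUCT__entry(
			     __field(uint64_t, id)
			     __field(u64, fence_context)
			     __field(u64, fence_seqno)
			     ),
	    TP_fast_assign(
			   /* One fence per event, so no dynamic array is needed. */
			   __entry->id = sched_job->id;
			   __entry->fence_context = fence->context;
			   __entry->fence_seqno = fence->seqno;
			   ),
	    TP_printk("id=%llu, fence=%llu:%llu",
		      __entry->id, __entry->fence_context, __entry->fence_seqno)
);

It would be called as trace_drm_sched_job_add_dependency(job, fence)
from drm_sched_job_add_dependency(), so each dependency becomes its own
trace line and userspace can correlate them via the job id.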

Regards,

Tvrtko

  	atomic_inc(entity->rq->sched->score);
  	WRITE_ONCE(entity->last_user, current->group_leader);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index ad306d3d7282..c645f07ebe26 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1214,7 +1214,7 @@ static void drm_sched_run_job_work(struct work_struct *w)
  	atomic_add(sched_job->credits, &sched->credit_count);
  	drm_sched_job_begin(sched_job);

-	trace_drm_run_job(sched_job, entity);
+	trace_drm_run_job(sched_job, entity, 0);
  	fence = sched->ops->run_job(sched_job);
  	complete_all(&entity->entity_idle);
  	drm_sched_fence_scheduled(s_fence, fence);