Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





Am 06.08.21 um 07:52 schrieb Jingwen Chen:
On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
From: Jack Zhang <Jack.Zhang1@xxxxxxx>

Why: Previously hw fence is alloced separately with job.
It caused historical lifetime issues and corner cases.
The ideal situation is to take fence to manage both job
and fence's lifetime, and simplify the design of gpu-scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission by this method.
2. For ib_test, and submit without a parent job keep the
legacy way to create a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embeded in a job.

Signed-off-by: Jingwen Chen <Jingwen.Chen2@xxxxxxx>
Signed-off-by: Jack Zhang <Jack.Zhang7@xxxxxxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 ++++++++++++++++-----
   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 ++++++++----
   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  4 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
   8 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
   	ret = dma_fence_wait(f, false);
   err_ib_sched:
-	dma_fence_put(f);
   	amdgpu_job_free(job);
   err:
   	return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
   			continue;
   		}
   		job = to_amdgpu_job(s_job);
-		if (preempted && job->fence == fence)
+		if (preempted && (&job->hw_fence) == fence)
   			/* mark the job as preempted */
   			job->preemption_status |= AMDGPU_IB_PREEMPTED;
   	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..5e29d797a265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
    *
    * @ring: ring the fence is associated with
    * @f: resulting fence object
+ * @job: job the fence is embeded in
    * @flags: flags to pass into the subordinate .emit_fence() call
    *
    * Emits a fence command on the requested ring (all asics).
    * Returns 0 on success, -ENOMEM on failure.
    */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
   		      unsigned flags)
   {
   	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_fence *fence;
+	struct dma_fence *fence;
+	struct amdgpu_fence *am_fence;
   	struct dma_fence __rcu **ptr;
   	uint32_t seq;
   	int r;
-	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
-	if (fence == NULL)
-		return -ENOMEM;
+	if (job == NULL) {
+		/* create a sperate hw fence */
+		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
+		if (am_fence == NULL)
+			return -ENOMEM;
+		fence = &am_fence->base;
+		am_fence->ring = ring;
+	} else {
+		/* take use of job-embedded fence */
+		fence = &job->hw_fence;
+		job->ring = ring;

If you would make hw_fence of type amdgpu_fence
you could probably avoid the special job->ring = ring
See more in related comment at the bottom

Hi Andry,

I'm only make the amdgpu_fence for the fence without job parameter
provided to amdgpu_fence_emit. For embeded fence which is the hw_fence
in amdgpu_job, it will be allocated along with amdgpu_job as dma_fence.

When you have the job and need the ring you can just do conatiner_of(job->sched, struct amdgpu_ring, sched).

No need for an extra ring pointer here.

Regards,
Christian.


Regards,
Jingwen Chen
+	}
   	seq = ++ring->fence_drv.sync_seq;
-	fence->ring = ring;
-	dma_fence_init(&fence->base, &amdgpu_fence_ops,
+	dma_fence_init(fence, &amdgpu_fence_ops,
   		       &ring->fence_drv.lock,
   		       adev->fence_context + ring->idx,
   		       seq);
+
+	if (job != NULL) {
+		/* mark this fence has a parent job */
+		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
+	}
+
   	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   			       seq, flags | AMDGPU_FENCE_FLAG_INT);
   	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
@@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
   	/* This function can't be called concurrently anyway, otherwise
   	 * emitting the fence would mess up the hardware ring buffer.
   	 */
-	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
+	rcu_assign_pointer(*ptr, dma_fence_get(fence));
-	*f = &fence->base;
+	*f = fence;
   	return 0;
   }
@@ -621,8 +637,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
   static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
   {
-	struct amdgpu_fence *fence = to_amdgpu_fence(f);
-	return (const char *)fence->ring->name;
+	struct amdgpu_ring *ring;
+
+	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
+		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
+
+		ring = job->ring;
+	} else {
+		ring = to_amdgpu_fence(f)->ring;
+	}

Same as above


+	return (const char *)ring->name;
   }
   /**
@@ -656,8 +680,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
   static void amdgpu_fence_free(struct rcu_head *rcu)
   {
   	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
-	struct amdgpu_fence *fence = to_amdgpu_fence(f);
-	kmem_cache_free(amdgpu_fence_slab, fence);
+
+	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
+	/* free job if fence has a parent job */
+		struct amdgpu_job *job;
+
+		job = container_of(f, struct amdgpu_job, hw_fence);
+		kfree(job);
+	} else {
+	/* free fence_slab if it's separated fence*/
+		struct amdgpu_fence *fence;
+
+		fence = to_amdgpu_fence(f);
+		kmem_cache_free(amdgpu_fence_slab, fence);
+	}
   }
   /**
@@ -680,6 +716,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
   	.release = amdgpu_fence_release,
   };
+
   /*
    * Fence debugfs
    */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ec65ab0ddf89..c076a6b9a5a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
   				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
   	}
-	r = amdgpu_fence_emit(ring, f, fence_flags);
+	r = amdgpu_fence_emit(ring, f, job, fence_flags);
   	if (r) {
   		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
   		if (job && job->vmid)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index d33e6d97cc89..65a395060de2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
   {
   	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
   	struct dma_fence *f;
+	struct dma_fence *hw_fence;
   	unsigned i;
-	/* use sched fence if available */
-	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
+	if (job->hw_fence.ops == NULL)
+		hw_fence = job->external_hw_fence;
+	else
+		hw_fence = &job->hw_fence;
+	/* use sched fence if available */
+	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
   	for (i = 0; i < job->num_ibs; ++i)
   		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
   }
@@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
   	drm_sched_job_cleanup(s_job);
-	dma_fence_put(job->fence);
   	amdgpu_sync_free(&job->sync);
   	amdgpu_sync_free(&job->sched_sync);
-	kfree(job);
+
+    /* only put the hw fence if has embedded fence */
+	if (job->hw_fence.ops != NULL)
+		dma_fence_put(&job->hw_fence);
+	else
+		kfree(job);
   }
   void amdgpu_job_free(struct amdgpu_job *job)
   {
   	amdgpu_job_free_resources(job);
-
-	dma_fence_put(job->fence);
   	amdgpu_sync_free(&job->sync);
   	amdgpu_sync_free(&job->sched_sync);
-	kfree(job);
+
+	/* only put the hw fence if has embedded fence */
+	if (job->hw_fence.ops != NULL)
+		dma_fence_put(&job->hw_fence);
+	else
+		kfree(job);
   }
   int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
@@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
   	job->base.sched = &ring->sched;
   	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
-	job->fence = dma_fence_get(*fence);
+	/* record external_hw_fence for direct submit */
+	job->external_hw_fence = dma_fence_get(*fence);
   	if (r)
   		return r;
   	amdgpu_job_free(job);
+	dma_fence_put(*fence);
+
   	return 0;
   }
@@ -246,10 +261,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
   		if (r)
   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
   	}
-	/* if gpu reset, hw fence will be replaced here */
-	dma_fence_put(job->fence);
-	job->fence = dma_fence_get(fence);
+	dma_fence_get(fence);
   	amdgpu_job_free_resources(job);
   	fence = r ? ERR_PTR(r) : fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 81caac9b958a..92324c978534 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -46,7 +46,9 @@ struct amdgpu_job {
   	struct amdgpu_sync	sync;
   	struct amdgpu_sync	sched_sync;
   	struct amdgpu_ib	*ibs;
-	struct dma_fence	*fence; /* the hw fence */
+	struct dma_fence	hw_fence;
+	struct amdgpu_ring *ring;
Why not instead of 2 fields above just embed   struct amdgpu_fence as
hw_fence  and by this save the extra 'ring' field handling ?

Andrey


+	struct dma_fence	*external_hw_fence;
   	uint32_t		preamble_status;
   	uint32_t                preemption_status;
   	uint32_t		num_ibs;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 9c11ced4312c..03d4b29a76d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -48,6 +48,9 @@
   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
+/* fence flag bit to indicate the face is embeded in job*/
+#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT		(DMA_FENCE_FLAG_USER_BITS + 1)
+
   #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
   #define AMDGPU_IB_POOL_SIZE	(1024 * 1024)
@@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
   void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
   int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
   void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
   		      unsigned flags);
   int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
   			      uint32_t timeout);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 2a88ed5d983b..2af8860d74cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
   		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
   	if (vm_flush_needed || pasid_mapping_needed) {
-		r = amdgpu_fence_emit(ring, &fence, 0);
+		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
   		if (r)
   			return r;
   	}




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux