[Public]

Hi Jingwen,

Your patch has caused an amdgpu driver load failure on all ASICs. Please revert it first and come up with a proper fix.

Regards,
Guchun

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Andrey Grodzovsky
Sent: Wednesday, August 11, 2021 12:41 AM
To: Chen, JingWen <JingWen.Chen2@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Liu, Monk <Monk.Liu@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; Jack Zhang <Jack.Zhang1@xxxxxxx>; Jack Zhang <Jack.Zhang7@xxxxxxxxxxx>
Subject: Re: [PATCH v4] drm/amd/amdgpu embed hw_fence into amdgpu_job

Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>

Andrey

On 2021-08-09 11:22 p.m., Jingwen Chen wrote:
> From: Jack Zhang <Jack.Zhang1@xxxxxxx>
>
> Why: Previously the hw fence was allocated separately from the job.
> This caused historical lifetime issues and corner cases.
> The ideal situation is to let the fence manage both the job's and the
> fence's lifetime, and to simplify the design of the gpu-scheduler.
>
> How:
> We propose to embed hw_fence into amdgpu_job.
> 1. Normal job submission is covered by this method.
> 2. For ib_test and submission without a parent job, keep the legacy
>    way of creating a hw fence separately.
>
> v2:
> use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> embedded in a job.
>
> v3:
> remove the redundant variable ring in amdgpu_job
>
> v4:
> add TDR sequence support for this feature. Add a job_run_counter to
> indicate whether this job is a resubmitted job.
>
> Signed-off-by: Jingwen Chen <Jingwen.Chen2@xxxxxxx>
> Signed-off-by: Jack Zhang <Jack.Zhang7@xxxxxxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 12 +++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 73 ++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c      |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 39 +++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  5 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
>  9 files changed, 108 insertions(+), 34 deletions(-)
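For readers following the diff below: the heart of the change is the classic embedded-object pattern. A minimal sketch of the idea, using hypothetical example_* names rather than the patch's actual definitions:

#include <linux/dma-fence.h>
#include <linux/kernel.h>

/* Sketch only: the hw fence lives inside the job instead of behind a
 * pointer, so the fence's refcount also governs the job's lifetime,
 * and container_of() recovers the job from its fence. */
struct example_job {
	struct dma_fence hw_fence;	/* embedded, not a pointer */
	unsigned int job_run_counter;	/* >= 1 means a resubmitted job */
	/* ... other job state ... */
};

static inline struct example_job *example_job_from_fence(struct dma_fence *f)
{
	return container_of(f, struct example_job, hw_fence);
}

With this layout, freeing the job and releasing the fence become a single operation, which is the lifetime simplification the commit message is after.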
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 7b46ba551cb2..3003ee1c9487 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>  	ret = dma_fence_wait(f, false);
>
>  err_ib_sched:
> -	dma_fence_put(f);
>  	amdgpu_job_free(job);
>  err:
>  	return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 536005bff24a..277128846dd1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
>  			continue;
>  		}
>  		job = to_amdgpu_job(s_job);
> -		if (preempted && job->fence == fence)
> +		if (preempted && (&job->hw_fence) == fence)
>  			/* mark the job as preempted */
>  			job->preemption_status |= AMDGPU_IB_PREEMPTED;
>  	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9e53ff851496..ade2fa07a50a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>  				 struct amdgpu_reset_context *reset_context)
>  {
> -	int i, r = 0;
> +	int i, j, r = 0;
>  	struct amdgpu_job *job = NULL;
>  	bool need_full_reset =
>  		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>  		if (!ring || !ring->sched.thread)
>  			continue;
>
> +		/* clear job fences from fence drv to avoid force_completion
> +		 * leaving NULL and vm flush fences in fence drv */
> +		for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
> +			struct dma_fence *old, **ptr;
> +
> +			ptr = &ring->fence_drv.fences[j];
> +			old = rcu_dereference_protected(*ptr, 1);
> +			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> +				RCU_INIT_POINTER(*ptr, NULL);
> +			}
> +		}
>  		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>  		amdgpu_fence_driver_force_completion(ring);
>  	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 7495911516c2..a8302e324110 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -129,30 +129,50 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>   *
>   * @ring: ring the fence is associated with
>   * @f: resulting fence object
> + * @job: job the fence is embedded in
>   * @flags: flags to pass into the subordinate .emit_fence() call
>   *
>   * Emits a fence command on the requested ring (all asics).
>   * Returns 0 on success, -ENOMEM on failure.
>   */
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amdgpu_job *job,
>  		      unsigned flags)
>  {
>  	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_fence *fence;
> +	struct dma_fence *fence;
> +	struct amdgpu_fence *am_fence;
>  	struct dma_fence __rcu **ptr;
>  	uint32_t seq;
>  	int r;
>
> -	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> -	if (fence == NULL)
> -		return -ENOMEM;
> +	if (job == NULL) {
> +		/* create a separate hw fence */
> +		am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
> +		if (am_fence == NULL)
> +			return -ENOMEM;
> +		fence = &am_fence->base;
> +		am_fence->ring = ring;
> +	} else {
> +		/* use the job-embedded fence */
> +		fence = &job->hw_fence;
> +	}
>
>  	seq = ++ring->fence_drv.sync_seq;
> -	fence->ring = ring;
> -	dma_fence_init(&fence->base, &amdgpu_fence_ops,
> -		       &ring->fence_drv.lock,
> -		       adev->fence_context + ring->idx,
> -		       seq);
> +	if (job != NULL && job->job_run_counter) {
> +		/* reinit seq for resubmitted jobs */
> +		fence->seqno = seq;
> +	} else {
> +		dma_fence_init(fence, &amdgpu_fence_ops,
> +			       &ring->fence_drv.lock,
> +			       adev->fence_context + ring->idx,
> +			       seq);
> +	}
> +
> +	if (job != NULL) {
> +		/* mark this fence as having a parent job */
> +		set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
> +	}
> +
>  	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>  			       seq, flags | AMDGPU_FENCE_FLAG_INT);
>  	pm_runtime_get_noresume(adev_to_drm(adev)->dev);
> @@ -175,9 +195,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
>  	/* This function can't be called concurrently anyway, otherwise
>  	 * emitting the fence would mess up the hardware ring buffer.
>  	 */
> -	rcu_assign_pointer(*ptr, dma_fence_get(&fence->base));
> +	rcu_assign_pointer(*ptr, dma_fence_get(fence));
>
> -	*f = &fence->base;
> +	*f = fence;
>
>  	return 0;
>  }
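A subtlety the v4 resubmit path above relies on: dma_fence_init() resets the fence's refcount, flags and error state, so it must run exactly once per embedded fence, and a resubmitted job may only refresh the sequence number. A hedged sketch of that split, continuing the hypothetical example_* naming:

/* Sketch: initialize the embedded fence on first use only; on a TDR
 * resubmit the fence object is still live (waiters may hold
 * references), so just bump its sequence number. */
static void example_prepare_hw_fence(struct dma_fence *fence, bool resubmit,
				     const struct dma_fence_ops *ops,
				     spinlock_t *lock, u64 context, u64 seq)
{
	if (resubmit)
		fence->seqno = seq;	/* keep refcount, ops, context */
	else
		dma_fence_init(fence, ops, lock, context, seq);
}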
> @@ -621,8 +641,16 @@ static const char *amdgpu_fence_get_driver_name(struct dma_fence *fence)
>
>  static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
>  {
> -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> -	return (const char *)fence->ring->name;
> +	struct amdgpu_ring *ring;
> +
> +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> +		struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
> +
> +		ring = to_amdgpu_ring(job->base.sched);
> +	} else {
> +		ring = to_amdgpu_fence(f)->ring;
> +	}
> +	return (const char *)ring->name;
>  }
>
>  /**
> @@ -656,8 +684,20 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
>  static void amdgpu_fence_free(struct rcu_head *rcu)
>  {
>  	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
> -	struct amdgpu_fence *fence = to_amdgpu_fence(f);
> -	kmem_cache_free(amdgpu_fence_slab, fence);
> +
> +	if (test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &f->flags)) {
> +		/* free the job if the fence has a parent job */
> +		struct amdgpu_job *job;
> +
> +		job = container_of(f, struct amdgpu_job, hw_fence);
> +		kfree(job);
> +	} else {
> +		/* free the fence_slab entry if it is a separate fence */
> +		struct amdgpu_fence *fence;
> +
> +		fence = to_amdgpu_fence(f);
> +		kmem_cache_free(amdgpu_fence_slab, fence);
> +	}
>  }
>
>  /**
> @@ -680,6 +720,7 @@ static const struct dma_fence_ops amdgpu_fence_ops = {
>  	.release = amdgpu_fence_release,
>  };
>
> +
>  /*
>   * Fence debugfs
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index ec65ab0ddf89..c076a6b9a5a2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -262,7 +262,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
>  	}
>
> -	r = amdgpu_fence_emit(ring, f, fence_flags);
> +	r = amdgpu_fence_emit(ring, f, job, fence_flags);
>  	if (r) {
>  		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>  		if (job && job->vmid)
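The release path above is where the embedding pays off: one driver-private flag bit, placed above DMA_FENCE_FLAG_USER_BITS so it cannot collide with the core dma-fence bits, tells the free path which container the fence lives in. Continuing the hypothetical example_* sketch:

#define EXAMPLE_FENCE_FLAG_EMBEDDED	(DMA_FENCE_FLAG_USER_BITS + 1)

/* Sketch of the dispatch in the RCU free callback: an embedded fence
 * frees its enclosing job, a standalone fence frees only itself. */
static void example_free_fence(struct dma_fence *f)
{
	if (test_bit(EXAMPLE_FENCE_FLAG_EMBEDDED, &f->flags))
		kfree(example_job_from_fence(f));
	else
		kfree(f);	/* standalone allocation */
}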
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index d33e6d97cc89..7da9c1855bd0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -127,11 +127,16 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>  {
>  	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>  	struct dma_fence *f;
> +	struct dma_fence *hw_fence;
>  	unsigned i;
>
> -	/* use sched fence if available */
> -	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;
> +	if (job->hw_fence.ops == NULL)
> +		hw_fence = job->external_hw_fence;
> +	else
> +		hw_fence = &job->hw_fence;
>
> +	/* use sched fence if available */
> +	f = job->base.s_fence ? &job->base.s_fence->finished : hw_fence;
>  	for (i = 0; i < job->num_ibs; ++i)
>  		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
>  }
> @@ -142,20 +147,27 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>
>  	drm_sched_job_cleanup(s_job);
>
> -	dma_fence_put(job->fence);
>  	amdgpu_sync_free(&job->sync);
>  	amdgpu_sync_free(&job->sched_sync);
> -	kfree(job);
> +
> +	/* only put the hw fence if the job has an embedded fence */
> +	if (job->hw_fence.ops != NULL)
> +		dma_fence_put(&job->hw_fence);
> +	else
> +		kfree(job);
>  }
>
>  void amdgpu_job_free(struct amdgpu_job *job)
>  {
>  	amdgpu_job_free_resources(job);
> -
> -	dma_fence_put(job->fence);
>  	amdgpu_sync_free(&job->sync);
>  	amdgpu_sync_free(&job->sched_sync);
> -	kfree(job);
> +
> +	/* only put the hw fence if the job has an embedded fence */
> +	if (job->hw_fence.ops != NULL)
> +		dma_fence_put(&job->hw_fence);
> +	else
> +		kfree(job);
>  }
>
>  int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
> @@ -184,11 +196,14 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>
>  	job->base.sched = &ring->sched;
>  	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
> -	job->fence = dma_fence_get(*fence);
> +	/* record external_hw_fence for direct submit */
> +	job->external_hw_fence = dma_fence_get(*fence);
>  	if (r)
>  		return r;
>
>  	amdgpu_job_free(job);
> +	dma_fence_put(*fence);
> +
>  	return 0;
>  }
>
> @@ -246,10 +261,12 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>  		if (r)
>  			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>  	}
> -	/* if gpu reset, hw fence will be replaced here */
> -	dma_fence_put(job->fence);
> -	job->fence = dma_fence_get(fence);
>
> +	if (!job->job_run_counter)
> +		dma_fence_get(fence);
> +	else if (finished->error < 0)
> +		dma_fence_put(&job->hw_fence);
> +	job->job_run_counter++;
>  	amdgpu_job_free_resources(job);
>
>  	fence = r ? ERR_PTR(r) : fence;
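One small but load-bearing detail in the two free paths above: jobs are zero-allocated, so hw_fence.ops stays NULL until dma_fence_init() runs, and the ops pointer doubles as an "embedded fence is live" test. In the hypothetical example_* terms:

/* Sketch: if the embedded fence was never initialized, nobody else can
 * hold a reference and the job may be freed directly; otherwise the
 * final dma_fence_put() frees the job via the fence release path. */
static void example_job_free(struct example_job *job)
{
	if (job->hw_fence.ops)
		dma_fence_put(&job->hw_fence);
	else
		kfree(job);
}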
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 81caac9b958a..9e65730193b8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -46,7 +46,8 @@ struct amdgpu_job {
>  	struct amdgpu_sync sync;
>  	struct amdgpu_sync sched_sync;
>  	struct amdgpu_ib *ibs;
> -	struct dma_fence *fence; /* the hw fence */
> +	struct dma_fence hw_fence;
> +	struct dma_fence *external_hw_fence;
>  	uint32_t preamble_status;
>  	uint32_t preemption_status;
>  	uint32_t num_ibs;
> @@ -62,6 +63,9 @@ struct amdgpu_job {
>  	/* user fence handling */
>  	uint64_t uf_addr;
>  	uint64_t uf_sequence;
> +
> +	/* job_run_counter >= 1 means a resubmitted job */
> +	uint32_t job_run_counter;
>  };
>
>  int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 9c11ced4312c..03d4b29a76d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -48,6 +48,9 @@
>  #define AMDGPU_FENCE_FLAG_INT (1 << 1)
>  #define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2)
>
> +/* fence flag bit to indicate the fence is embedded in a job */
> +#define AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT (DMA_FENCE_FLAG_USER_BITS + 1)
> +
>  #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
>
>  #define AMDGPU_IB_POOL_SIZE (1024 * 1024)
> @@ -118,7 +121,7 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>  void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
>  int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
>  void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, struct amdgpu_job *job,
>  		      unsigned flags);
>  int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
>  			      uint32_t timeout);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 2a88ed5d983b..2af8860d74cc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1218,7 +1218,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>  		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
>
>  	if (vm_flush_needed || pasid_mapping_needed) {
> -		r = amdgpu_fence_emit(ring, &fence, 0);
> +		r = amdgpu_fence_emit(ring, &fence, NULL, 0);
>  		if (r)
>  			return r;
>  	}
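Finally, the resubmit bookkeeping added in v4: as I read the patch, the first run of a job takes the extra fence reference that the old job->fence = dma_fence_get(fence) assignment used to take, while a TDR resubmit of an already-failed job drops the reference left over from the failed run. A sketch of that rule in the hypothetical example_* terms, not the driver's literal code:

static struct dma_fence *example_job_run(struct example_job *job,
					 struct dma_fence *fence, int error)
{
	if (!job->job_run_counter)
		dma_fence_get(fence);		/* first run: take the job's reference */
	else if (error < 0)
		dma_fence_put(&job->hw_fence);	/* failed resubmit: drop the stale reference */
	job->job_run_counter++;
	return fence;
}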