[AMD Official Use Only - General]

Hi Christian,

The details are as follows:

> 1. Use unmap_queue package to trigger preemption on gfx9
>    Add trailing fence to track the preemption done.

On gfx9 there is no single package that completes an MCBP request in one frame the way gfx10 does. To request preemption on gfx9, the KMD needs to:

1. Emit a trailing fence on the gfx ring, without updating the wptr to the CP.
2. Emit a write_reg that resets mmCP_VMID_PREEMPT after the trailing fence.
3. Send unmap_queue on the KIQ ring, with the rb_wptr field set to the offset of the trailing fence on the gfx ring.

When the CP firmware receives the unmap_queue on the MEC, it will:

1. Store mmCP_RB0_WPTR from rb_wptr to kick the GFX RB off.
2. Write mmCP_VMID_PREEMPT as 0xffff to request preemption on all vmids, then wait for mmCP_VMID_PREEMPT to become 0x0, which indicates the preemption is complete.
3. The rest of the pipeline performs the preemption according to mmCP_VMID_PREEMPT until it hits the trailing fence.
4. Once the trailing fence is signaled, the write_reg that resets mmCP_VMID_PREEMPT unblocks the unmap_queue package so it can proceed.

The use of rb_wptr in unmap_queue on gfx9 is taken from the cp_packages_rn doc:

UNMAP_QUEUES
DW | Bits | Field   | Description
4b | 19:0 | rb_wptr | If ((engine_sel = 4) and (action = 3)) then preempted GFX queue's new RB pointer.

A condensed sketch of this KMD-side sequence is included further below.

> 2. Modify emit_ce_meta emit_de_meta functions
>    for the resumed ibs.

For preemption-enabled IBs, the KMD adds a preamble IB (CE/DE meta) to initialize the CSA data before sending the main IB. The CSA is used to save/restore IB execution state when preemption/resubmission happens. The KMD is responsible for extracting the content from the CSA when it resubmits a previously preempted DMA frame. This patch writes the CSA data for resubmitted IBs from the previously preempted IB's CSA (see the second sketch further below).

Thanks,
Jiadong

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Friday, August 12, 2022 7:39 PM
To: Zhu, Jiadong <Jiadong.Zhu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Huang, Ray <Ray.Huang@xxxxxxx>; Liu, Aaron <Aaron.Liu@xxxxxxx>
Subject: Re: [PATCH] drm/amdgpu: modify mcbp implement for gfx9(v3)

[CAUTION: External Email]

Am 11.08.22 um 05:19 schrieb jiadong.zhu@xxxxxxx:
> From: "Jiadong.Zhu" <Jiadong.Zhu@xxxxxxx>
>
> 1. Use unmap_queue package to trigger preemption on gfx9
>    Add trailing fence to track the preemption done.
> 2. Modify emit_ce_meta emit_de_meta functions
>    for the resumed ibs.
>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 161 ++++++++++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
>  3 files changed, 143 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 82c178a9033a..ca626f0ad7b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
>  #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>  #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>  #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> +#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)

Ok, that here needs much more explanation why you need it and how all this is supposed to work?

Regards,
Christian.
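
For reference, here is a condensed sketch of the KMD-side sequence described above. It is boiled down from gfx_v9_0_ring_preempt_ib in the patch below; ring allocation, KIQ locking and error handling are trimmed, so treat it as an illustration rather than the exact code:

        /* assert the preemption condition (amdgpu_ring_set_preempt_cond_exec) */
        amdgpu_ring_set_preempt_cond_exec(ring, false);

        /* 1) trailing fence on the gfx ring; the wptr is not pushed to the CP */
        ring->trail_seq += 1;
        gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
                                 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);

        /* 2) write_reg after the fence: clearing mmCP_VMID_PREEMPT is what
         * unblocks the unmap_queue once the trailing fence has signaled */
        amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT), 0x0);

        /* 3) unmap_queue on the KIQ ring; rb_wptr gives the CP the new gfx RB
         * write pointer, i.e. the offset of the trailing fence described above */
        kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
                                   ring->trail_fence_gpu_addr, ring->trail_seq);
        amdgpu_ring_commit(kiq_ring);

        /* 4) poll the trailing fence to know the preemption has completed */
        for (i = 0; i < adev->usec_timeout; i++) {
                if (ring->trail_seq == le32_to_cpu(*(ring->trail_fence_cpu_addr)))
                        break;
                udelay(1);
        }

The ordering is the point: because the write_reg sits after the trailing fence on the gfx ring, mmCP_VMID_PREEMPT can only be cleared once the fence has signaled, which is what lets the blocked unmap_queue proceed.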
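
And a second sketch for point 2: for a resumed IB the DE meta is written from the CSA contents saved at preemption time instead of from a zero-initialized payload. This is condensed from gfx_v9_0_ring_emit_de_meta in the patch below (the MES queue branch and the WRITE_DATA packet setup are omitted):

        /* de_payload lives inside the per-context CSA */
        offset = offsetof(struct v9_gfx_meta_data, de_payload);
        de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
        de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
        ...
        if (resume)
                /* resubmission: replay the execution state saved to the CSA
                 * when the previous submission was preempted */
                amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
                                           sizeof(de_payload) >> 2);
        else
                /* first submission: start from a zeroed v9_de_ib_state */
                amdgpu_ring_write_multiple(ring, (void *)&de_payload,
                                           sizeof(de_payload) >> 2);

The resume flag is derived in gfx_v9_0_ring_emit_ib_gfx / gfx_v9_ring_emit_cntxcntl as (!amdgpu_sriov_vf(ring->adev) && (flags & AMDGPU_IB_PREEMPTED)), so only bare-metal resubmissions of previously preempted IBs take the CSA path.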
>
>  #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5332899642dc..887021fd56aa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
>  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
>                                 struct amdgpu_cu_info *cu_info);
>  static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
>  static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
>  static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>                                             void *ras_error_status);
> @@ -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
>                            PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
>
>         if (action == PREEMPT_QUEUES_NO_UNMAP) {
> -               amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
> -               amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
> -               amdgpu_ring_write(kiq_ring, seq);
> +               amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
> +               amdgpu_ring_write(kiq_ring, 0);
> +               amdgpu_ring_write(kiq_ring, 0);
> +
>         } else {
>                 amdgpu_ring_write(kiq_ring, 0);
>                 amdgpu_ring_write(kiq_ring, 0);
> @@ -5446,11 +5447,16 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>
>         control |= ib->length_dw | (vmid << 24);
>
> -       if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
> +       if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) &&
> +           (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
>                 control |= INDIRECT_BUFFER_PRE_ENB(1);
>
> +               if (flags & AMDGPU_IB_PREEMPTED)
> +                       control |= INDIRECT_BUFFER_PRE_RESUME(1);
> +
>                 if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
> -                       gfx_v9_0_ring_emit_de_meta(ring);
> +                       gfx_v9_0_ring_emit_de_meta(ring,
> +                                                  (!amdgpu_sriov_vf(ring->adev) &&
> +                                                   flags & AMDGPU_IB_PREEMPTED) ? true : false);
>         }
>
>         amdgpu_ring_write(ring, header);
> @@ -5505,6 +5511,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>         bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>         bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>         bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +       bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
>
>         /* RELEASE_MEM - flush caches, send int */
>         amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
> @@ -5515,6 +5522,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>                                        EOP_TC_WB_ACTION_EN |
>                                        EOP_TC_MD_ACTION_EN)) |
>                                  EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
> +                                (exec ? EOP_EXEC : 0x0) |
>                                  EVENT_INDEX(5)));
>         amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) |
>                                 INT_SEL(int_sel ? 2 : 0));
>
> @@ -5620,33 +5628,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
>         amdgpu_ring_write(ring, 0);
>  }
>
> -static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
>  {
> +       struct amdgpu_device *adev = ring->adev;
>         struct v9_ce_ib_state ce_payload = {0};
> -       uint64_t csa_addr;
> +       uint64_t offset, ce_payload_gpu_addr;
> +       void *ce_payload_cpu_addr;
>         int cnt;
>
>         cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
> -       csa_addr = amdgpu_csa_vaddr(ring->adev);
> +
> +       if (ring->is_mes_queue) {
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gfx_meta_data) +
> +                        offsetof(struct v9_gfx_meta_data, ce_payload);
> +               ce_payload_gpu_addr =
> +                       amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +               ce_payload_cpu_addr =
> +                       amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +       } else {
> +               offset = offsetof(struct v9_gfx_meta_data, ce_payload);
> +               ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +               ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +       }
>
>         amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
>         amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
>                                  WRITE_DATA_DST_SEL(8) |
>                                  WR_CONFIRM) |
>                                  WRITE_DATA_CACHE_POLICY(0));
> -       amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -       amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -       amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
> +       amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
> +       amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
> +
> +       if (resume)
> +               amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
> +                                          sizeof(ce_payload) >> 2);
> +       else
> +               amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
> +                                          sizeof(ce_payload) >> 2);
> +}
> +
> +static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
> +{
> +       int i, r = 0;
> +       struct amdgpu_device *adev = ring->adev;
> +       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> +       struct amdgpu_ring *kiq_ring = &kiq->ring;
> +       unsigned long flags;
> +
> +       if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> +               return -EINVAL;
> +
> +       spin_lock_irqsave(&kiq->ring_lock, flags);
> +
> +       if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
> +               spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +               return -ENOMEM;
> +       }
> +
> +       /* assert preemption condition */
> +       amdgpu_ring_set_preempt_cond_exec(ring, false);
> +
> +       ring->trail_seq += 1;
> +       amdgpu_ring_alloc(ring, 13);
> +       gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> +                                ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +       /*reset the CP_VMID_PREEMPT after trailing fence*/
> +       amdgpu_ring_emit_wreg(ring,
> +                             SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
> +                             0x0);
> +
> +       /* assert IB preemption, emit the trailing fence */
> +       kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
> +                                  ring->trail_fence_gpu_addr,
> +                                  ring->trail_seq);
> +
> +       amdgpu_ring_commit(kiq_ring);
> +       spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +
> +       /* poll the trailing fence */
> +       for (i = 0; i < adev->usec_timeout; i++) {
> +               if (ring->trail_seq ==
> +                   le32_to_cpu(*(ring->trail_fence_cpu_addr)))
> +                       break;
> +               udelay(1);
> +       }
> +
> +       if (i >= adev->usec_timeout) {
> +               r = -EINVAL;
> +               DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
> +       }
> +
> +       amdgpu_ring_commit(ring);
> +
> +       /* deassert preemption condition */
> +       amdgpu_ring_set_preempt_cond_exec(ring, true);
> +       return r;
>  }
>
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
>  {
> +       struct amdgpu_device *adev = ring->adev;
>         struct v9_de_ib_state de_payload = {0};
> -       uint64_t csa_addr, gds_addr;
> +       uint64_t offset, gds_addr, de_payload_gpu_addr;
> +       void *de_payload_cpu_addr;
>         int cnt;
>
> -       csa_addr = amdgpu_csa_vaddr(ring->adev);
> -       gds_addr = csa_addr + 4096;
> +       if (ring->is_mes_queue) {
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gfx_meta_data) +
> +                        offsetof(struct v9_gfx_meta_data, de_payload);
> +               de_payload_gpu_addr =
> +                       amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +               de_payload_cpu_addr =
> +                       amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gds_backup) +
> +                        offsetof(struct v9_gfx_meta_data, de_payload);
> +               gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +       } else {
> +               offset = offsetof(struct v9_gfx_meta_data, de_payload);
> +               de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +               de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +
> +               gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
> +                                AMDGPU_CSA_SIZE - adev->gds.gds_size,
> +                                PAGE_SIZE);
> +       }
> +
>         de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
>         de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
>
> @@ -5656,9 +5766,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
>                                  WRITE_DATA_DST_SEL(8) |
>                                  WR_CONFIRM) |
>                                  WRITE_DATA_CACHE_POLICY(0));
> -       amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -       amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -       amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
> +       amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
> +       amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
> +
> +       if (resume)
> +               amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
> +                                          sizeof(de_payload) >> 2);
> +       else
> +               amdgpu_ring_write_multiple(ring, (void *)&de_payload,
> +                                          sizeof(de_payload) >> 2);
>  }
>
>  static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
> @@ -5674,8 +5790,10 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>  {
>         uint32_t dw2 = 0;
>
> -       if (amdgpu_sriov_vf(ring->adev))
> -               gfx_v9_0_ring_emit_ce_meta(ring);
> +       if (amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp)
> +               gfx_v9_0_ring_emit_ce_meta(ring,
> +                                          (!amdgpu_sriov_vf(ring->adev) &&
> +                                           flags & AMDGPU_IB_PREEMPTED) ? true : false);
>
>         dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>         if (flags & AMDGPU_HAVE_CTX_SWITCH) {
> @@ -7024,6 +7142,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>         .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>         .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
> +       .preempt_ib = gfx_v9_0_ring_preempt_ib,
>         .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 799925d22fc8..614e9f8467fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -162,6 +162,7 @@
>   *             2 - Bypass
>   */
>  #define        INDIRECT_BUFFER_PRE_ENB(x)              ((x) << 21)
> +#define        INDIRECT_BUFFER_PRE_RESUME(x)           ((x) << 30)
>  #define        PACKET3_COPY_DATA                       0x40
>  #define        PACKET3_PFP_SYNC_ME                     0x42
>  #define        PACKET3_COND_WRITE                      0x45
> @@ -184,6 +185,7 @@
>  #define        EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
>  #define        EOP_TC_NC_ACTION_EN                     (1 << 19)
>  #define        EOP_TC_MD_ACTION_EN                     (1 << 21) /* L2 metadata */
> +#define        EOP_EXEC                                (1 << 28) /* For Trailing Fence */
>
>  #define        DATA_SEL(x)                             ((x) << 29)
>                 /* 0 - discard