[AMD Official Use Only - General]

Hi Christian,

There is an embedded project based on Xen. One of the guest VMs runs high-priority jobs and needs to preempt work submitted by the other VM. Other components are involved as well, including the UMD and QEMU. On the KMD side we only modify the MCBP-related functions so that they pass the unit test.

Thanks,
Jiadong

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@xxxxxxx>
Sent: Tuesday, July 19, 2022 9:59 PM
To: Zhu, Jiadong <Jiadong.Zhu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Huang, Ray <Ray.Huang@xxxxxxx>; Liu, Aaron <Aaron.Liu@xxxxxxx>
Subject: Re: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9

Well, what's the background for this?

So far MCBP isn't a validated feature; we just added some debugfs interface for testing it.

Regards,
Christian.

Am 19.07.22 um 04:09 schrieb jiadong.zhu@xxxxxxx:
> From: "Jiadong.Zhu" <Jiadong.Zhu@xxxxxxx>
>
> 1. Use the unmap_queues packet to trigger preemption on gfx9.
>    Add a trailing fence to track when the preemption is done.
> 2. Modify the emit_ce_meta and emit_de_meta functions
>    for the resumed IBs.
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 156 ++++++++++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
>  3 files changed, 138 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 82c178a9033a..ca626f0ad7b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
>  #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>  #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>  #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> +#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)
>
>  #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5332899642dc..e2c614441691 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
>  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
>                                 struct amdgpu_cu_info *cu_info);
>  static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
>  static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
>  static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>                                            void *ras_error_status);
> @@ -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
>                           PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
>
>         if (action == PREEMPT_QUEUES_NO_UNMAP) {
> -               amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
> -               amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
> -               amdgpu_ring_write(kiq_ring, seq);
> +               amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
> +               amdgpu_ring_write(kiq_ring, 0);
> +               amdgpu_ring_write(kiq_ring, 0);
> +
>         } else {
>                 amdgpu_ring_write(kiq_ring, 0);
>                 amdgpu_ring_write(kiq_ring, 0);
> @@ -5446,11 +5447,15 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>
>         control |= ib->length_dw | (vmid << 24);
>
> -       if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
> +       if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
>                 control |= INDIRECT_BUFFER_PRE_ENB(1);
>
> +               if (flags & AMDGPU_IB_PREEMPTED)
> +                       control |= INDIRECT_BUFFER_PRE_RESUME(1);
> +
>                 if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
> -                       gfx_v9_0_ring_emit_de_meta(ring);
> +                       gfx_v9_0_ring_emit_de_meta(ring,
> +                               (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false);
>         }
>
>         amdgpu_ring_write(ring, header);
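
To make the new bits easier to follow: this is roughly how the control dword for an IB ends up, simplified from the hunk above (the SR-IOV/amdgpu_mcbp gating is dropped for brevity; the macros are the ones from soc15d.h in this patch):

	uint32_t control = ib->length_dw | (vmid << 24);

	if (ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
		/* allow the CP to preempt inside this IB */
		control |= INDIRECT_BUFFER_PRE_ENB(1);
		/* mark the resubmission of a previously preempted IB */
		if (flags & AMDGPU_IB_PREEMPTED)
			control |= INDIRECT_BUFFER_PRE_RESUME(1);
	}
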
> @@ -5505,6 +5510,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>         bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>         bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>         bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +       bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
>
>         /* RELEASE_MEM - flush caches, send int */
>         amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
> @@ -5515,6 +5521,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>                                EOP_TC_WB_ACTION_EN |
>                                EOP_TC_MD_ACTION_EN)) |
>                               EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
> +                             (exec ? EOP_EXEC : 0x0) |
>                               EVENT_INDEX(5)));
>         amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>
> @@ -5620,33 +5627,132 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
>         amdgpu_ring_write(ring, 0);
>  }
>
> -static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
>  {
> +       struct amdgpu_device *adev = ring->adev;
>         struct v9_ce_ib_state ce_payload = {0};
> -       uint64_t csa_addr;
> +       uint64_t offset, ce_payload_gpu_addr;
> +       void *ce_payload_cpu_addr;
>         int cnt;
>
>         cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
> -       csa_addr = amdgpu_csa_vaddr(ring->adev);
> +
> +       if (ring->is_mes_queue) {
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gfx_meta_data) +
> +                       offsetof(struct v9_gfx_meta_data, ce_payload);
> +               ce_payload_gpu_addr =
> +                       amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +               ce_payload_cpu_addr =
> +                       amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +       } else {
> +               offset = offsetof(struct v9_gfx_meta_data, ce_payload);
> +               ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +               ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +       }
>
>         amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
>         amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
>                                  WRITE_DATA_DST_SEL(8) |
>                                  WR_CONFIRM) |
>                                  WRITE_DATA_CACHE_POLICY(0));
> -       amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -       amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -       amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
> +       amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
> +       amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
> +
> +       if (resume)
> +               amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
> +                                          sizeof(ce_payload) >> 2);
> +       else
> +               amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
> +                                          sizeof(ce_payload) >> 2);
> +}
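
The idea behind the new resume parameter: on first submission we write a zero-initialized CE payload, while on resubmission we replay the payload the CP saved into the CSA when it preempted. In the bare-metal (non-MES) case the GPU address and the CPU pointer are simply the same CSA slot seen from both sides, roughly like this (a sketch reusing the names from the hunk above):

	/* GPU VA that the WRITE_DATA packet targets */
	offset = offsetof(struct v9_gfx_meta_data, ce_payload);
	ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;

	/* kernel mapping of the same slot, replayed when resume == true */
	ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
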
> +
> +static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
> +{
> +       int i, r = 0;
> +       struct amdgpu_device *adev = ring->adev;
> +       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> +       struct amdgpu_ring *kiq_ring = &kiq->ring;
> +       unsigned long flags;
> +
> +       if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> +               return -EINVAL;
> +
> +       spin_lock_irqsave(&kiq->ring_lock, flags);
> +
> +       if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
> +               spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +               return -ENOMEM;
> +       }
> +
> +       /* assert preemption condition */
> +       amdgpu_ring_set_preempt_cond_exec(ring, false);
> +
> +       ring->trail_seq += 1;
> +       amdgpu_ring_alloc(ring, 8);
> +       gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> +                                ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +       /* assert IB preemption, emit the trailing fence */
> +       kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
> +                                  ring->trail_fence_gpu_addr,
> +                                  ring->trail_seq);
> +
> +       amdgpu_ring_commit(kiq_ring);
> +       spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +
> +       /* poll the trailing fence */
> +       for (i = 0; i < adev->usec_timeout; i++) {
> +               if (ring->trail_seq ==
> +                   le32_to_cpu(*(ring->trail_fence_cpu_addr)))
> +                       break;
> +               udelay(1);
> +       }
> +
> +       if (i >= adev->usec_timeout) {
> +               r = -EINVAL;
> +               DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
> +       }
> +
> +       amdgpu_ring_commit(ring);
> +       /* reset the CP_VMID_PREEMPT after trailing fence */
> +       WREG32_SOC15(GC, 0, mmCP_VMID_PREEMPT, 0x0);
> +
> +       /* deassert preemption condition */
> +       amdgpu_ring_set_preempt_cond_exec(ring, true);
> +       return r;
> +}
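
For testing, the unit test reaches this hook the same way the existing debugfs interface does, i.e. through the generic ring-funcs wrapper registered below; roughly like this (a sketch, assuming the amdgpu_ring_preempt_ib() macro from amdgpu_ring.h; locking and error handling omitted):

	struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
	int r = 0;

	/* dispatches to gfx_v9_0_ring_preempt_ib() via .preempt_ib */
	if (ring->funcs->preempt_ib)
		r = amdgpu_ring_preempt_ib(ring);
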
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
>  {
> +       struct amdgpu_device *adev = ring->adev;
>         struct v9_de_ib_state de_payload = {0};
> -       uint64_t csa_addr, gds_addr;
> +       uint64_t offset, gds_addr, de_payload_gpu_addr;
> +       void *de_payload_cpu_addr;
>         int cnt;
>
> -       csa_addr = amdgpu_csa_vaddr(ring->adev);
> -       gds_addr = csa_addr + 4096;
> +       if (ring->is_mes_queue) {
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gfx_meta_data) +
> +                       offsetof(struct v9_gfx_meta_data, de_payload);
> +               de_payload_gpu_addr =
> +                       amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +               de_payload_cpu_addr =
> +                       amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gds_backup) +
> +                       offsetof(struct v9_gfx_meta_data, de_payload);
> +               gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +       } else {
> +               offset = offsetof(struct v9_gfx_meta_data, de_payload);
> +               de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +               de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +
> +               gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
> +                                AMDGPU_CSA_SIZE - adev->gds.gds_size,
> +                                PAGE_SIZE);
> +       }
> +
>         de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
>         de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
>
> @@ -5656,9 +5762,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
>                                  WRITE_DATA_DST_SEL(8) |
>                                  WR_CONFIRM) |
>                                  WRITE_DATA_CACHE_POLICY(0));
> -       amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -       amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -       amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
> +       amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
> +       amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
> +
> +       if (resume)
> +               amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
> +                                          sizeof(de_payload) >> 2);
> +       else
> +               amdgpu_ring_write_multiple(ring, (void *)&de_payload,
> +                                          sizeof(de_payload) >> 2);
>  }
>
>  static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
> @@ -5674,8 +5786,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>  {
>         uint32_t dw2 = 0;
>
> -       if (amdgpu_sriov_vf(ring->adev))
> -               gfx_v9_0_ring_emit_ce_meta(ring);
> +       if (amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp)
> +               gfx_v9_0_ring_emit_ce_meta(ring,
> +                       (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false);
>
>         dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>         if (flags & AMDGPU_HAVE_CTX_SWITCH) {
> @@ -7024,6 +7137,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>         .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>         .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
> +       .preempt_ib = gfx_v9_0_ring_preempt_ib,
>         .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 799925d22fc8..614e9f8467fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -162,6 +162,7 @@
>   *               2 - Bypass
>   */
>  #define INDIRECT_BUFFER_PRE_ENB(x)             ((x) << 21)
> +#define INDIRECT_BUFFER_PRE_RESUME(x)          ((x) << 30)
>  #define PACKET3_COPY_DATA                      0x40
>  #define PACKET3_PFP_SYNC_ME                    0x42
>  #define PACKET3_COND_WRITE                     0x45
> @@ -184,6 +185,7 @@
>  #define EOP_TC_ACTION_EN                       (1 << 17) /* L2 */
>  #define EOP_TC_NC_ACTION_EN                    (1 << 19)
>  #define EOP_TC_MD_ACTION_EN                    (1 << 21) /* L2 metadata */
> +#define EOP_EXEC                               (1 << 28) /* For Trailing Fence */
>
>  #define DATA_SEL(x)                            ((x) << 29)
>  /* 0 - discard