[Public]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Christian König
> Sent: Tuesday, September 5, 2023 2:04 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Sharma, Shashank <Shashank.Sharma@xxxxxxx>
> Subject: [PATCH 06/11] drm/amdgpu: fix and cleanup gmc_v9_0_flush_gpu_tlb_pasid
>
> Testing for reset is pointless since the reset can start right after the test.
>
> The same PASID can be used by more than one VMID, reset each of them.
>
> Move the KIQ and all the workaround handling into common GMC code.
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>

Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  60 +++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  10 ++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 109 ++++++++----------------
>  3 files changed, 102 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 857051093900..b5f1a1218725 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -32,6 +32,7 @@
>  #include "amdgpu.h"
>  #include "amdgpu_gmc.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_xgmi.h"
>
>  #include <drm/drm_drv.h>
> @@ -623,6 +624,65 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  		DRM_ERROR("Error flushing GPU TLB using the SDMA (%d)!\n", r);
>  }
>
> +int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
> +				   uint32_t flush_type, bool all_hub,
> +				   uint32_t inst)
> +{
> +	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT :
> +		adev->usec_timeout;
> +	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
> +	unsigned int ndw;
> +	signed long r;
> +	uint32_t seq;
> +
> +	if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
> +	    !down_read_trylock(&adev->reset_domain->sem)) {
> +		return adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
> +								flush_type,
> +								all_hub, inst);
> +	}
> +
> +	/* 2 dwords flush + 8 dwords fence */
> +	ndw = kiq->pmf->invalidate_tlbs_size + 8;
> +
> +	if (adev->gmc.flush_tlb_needs_extra_type_2)
> +		ndw += kiq->pmf->invalidate_tlbs_size;
> +
> +	if (adev->gmc.flush_tlb_needs_extra_type_0)
> +		ndw += kiq->pmf->invalidate_tlbs_size;
> +
> +	spin_lock(&adev->gfx.kiq[inst].ring_lock);
> +	amdgpu_ring_alloc(ring, ndw);
> +	if (adev->gmc.flush_tlb_needs_extra_type_2)
> +		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
> +
> +	if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
> +		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
> +
> +	kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
> +	r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> +	if (r) {
> +		amdgpu_ring_undo(ring);
> +		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> +		goto error_unlock_reset;
> +	}
> +
> +	amdgpu_ring_commit(ring);
> +	spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> +	r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
> +	if (r < 1) {
> +		dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +		r = -ETIME;
> +		goto error_unlock_reset;
> +	}
> +	r = 0;
> +
> +error_unlock_reset:
> +	up_read(&adev->reset_domain->sem);
> +	return r;
> +}
> +
>  /**
>   * amdgpu_gmc_tmz_set -- check and set if a device supports TMZ
>   * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 9e7df2f69123..7732d4ef845e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -335,11 +335,12 @@ struct amdgpu_gmc {
>  	u64 MC_VM_MX_L1_TLB_CNTL;
>
>  	u64 noretry_flags;
> +
> +	bool flush_tlb_needs_extra_type_0;
> +	bool flush_tlb_needs_extra_type_2;
> +	bool flush_pasid_uses_kiq;
>  };
>
> -#define amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, type, allhub, inst) \
> -	((adev)->gmc.gmc_funcs->flush_gpu_tlb_pasid \
> -	((adev), (pasid), (type), (allhub), (inst)))
>  #define amdgpu_gmc_emit_flush_gpu_tlb(r, vmid, addr) (r)->adev->gmc.gmc_funcs->emit_flush_gpu_tlb((r), (vmid), (addr))
>  #define amdgpu_gmc_emit_pasid_mapping(r, vmid, pasid) (r)->adev->gmc.gmc_funcs->emit_pasid_mapping((r), (vmid), (pasid))
>  #define amdgpu_gmc_map_mtype(adev, flags) (adev)->gmc.gmc_funcs->map_mtype((adev),(flags))
> @@ -404,6 +405,9 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>  int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>  void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  			      uint32_t vmhub, uint32_t flush_type);
> +int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
> +				   uint32_t flush_type, bool all_hub,
> +				   uint32_t inst);
>
>  extern void amdgpu_gmc_tmz_set(struct amdgpu_device *adev);
>  extern void amdgpu_gmc_noretry_set(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 4f6990ba71cb..39016b6900d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -954,87 +954,30 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>  					uint16_t pasid, uint32_t flush_type,
>  					bool all_hub, uint32_t inst)
>  {
> -	int vmid, i;
> -	signed long r;
> -	uint32_t seq;
> -	uint16_t queried_pasid;
> -	bool ret;
> -	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT : adev->usec_timeout;
> -	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
> -
> -	if (amdgpu_in_reset(adev))
> -		return -EIO;
> -
> -	if (ring->sched.ready && down_read_trylock(&adev->reset_domain->sem)) {
> -		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
> -		 * heavy-weight TLB flush (type 2), which flushes
> -		 * both. Due to a race condition with concurrent
> -		 * memory accesses using the same TLB cache line, we
> -		 * still need a second TLB flush after this.
> -		 */
> -		bool vega20_xgmi_wa = (adev->gmc.xgmi.num_physical_nodes &&
> -				       adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 0));
> -		/* 2 dwords flush + 8 dwords fence */
> -		unsigned int ndw = kiq->pmf->invalidate_tlbs_size + 8;
> -
> -		if (vega20_xgmi_wa)
> -			ndw += kiq->pmf->invalidate_tlbs_size;
> -
> -		spin_lock(&adev->gfx.kiq[inst].ring_lock);
> -		/* 2 dwords flush + 8 dwords fence */
> -		amdgpu_ring_alloc(ring, ndw);
> -		if (vega20_xgmi_wa)
> -			kiq->pmf->kiq_invalidate_tlbs(ring,
> -						      pasid, 2, all_hub);
> -
> -		if (flush_type == 2 &&
> -		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) &&
> -		    adev->rev_id == 0)
> -			kiq->pmf->kiq_invalidate_tlbs(ring,
> -						      pasid, 0, all_hub);
> -
> -		kiq->pmf->kiq_invalidate_tlbs(ring,
> -					      pasid, flush_type, all_hub);
> -		r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> -		if (r) {
> -			amdgpu_ring_undo(ring);
> -			spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> -			up_read(&adev->reset_domain->sem);
> -			return -ETIME;
> -		}
> -
> -		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> -		r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
> -		if (r < 1) {
> -			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> -			up_read(&adev->reset_domain->sem);
> -			return -ETIME;
> -		}
> -		up_read(&adev->reset_domain->sem);
> -		return 0;
> -	}
> +	uint16_t queried;
> +	int i, vmid;
>
>  	for (vmid = 1; vmid < 16; vmid++) {
> +		bool valid;
>
> -		ret = gmc_v9_0_get_atc_vmid_pasid_mapping_info(adev, vmid,
> -				&queried_pasid);
> -		if (ret && queried_pasid == pasid) {
> -			if (all_hub) {
> -				for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
> -					gmc_v9_0_flush_gpu_tlb(adev, vmid,
> -							i, flush_type);
> -			} else {
> -				gmc_v9_0_flush_gpu_tlb(adev, vmid,
> -						AMDGPU_GFXHUB(0), flush_type);
> -			}
> -			break;
> +		valid = gmc_v9_0_get_atc_vmid_pasid_mapping_info(adev, vmid,
> +								 &queried);
> +		if (!valid || queried != pasid)
> +			continue;
> +
> +		if (all_hub) {
> +			for_each_set_bit(i, adev->vmhubs_mask,
> +					 AMDGPU_MAX_VMHUBS)
> +				gmc_v9_0_flush_gpu_tlb(adev, vmid, i,
> +						       flush_type);
> +		} else {
> +			gmc_v9_0_flush_gpu_tlb(adev, vmid,
> +					       AMDGPU_GFXHUB(0),
> +					       flush_type);
>  		}
>  	}
>
>  	return 0;
> -
>  }
>
>  static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> @@ -2335,6 +2278,24 @@ static int gmc_v9_0_hw_init(void *handle)
>  	bool value;
>  	int i, r;
>
> +	adev->gmc.flush_pasid_uses_kiq = true;
> +
> +	/* Vega20+XGMI caches PTEs in TC and TLB. Add a heavy-weight TLB flush
> +	 * (type 2), which flushes both. Due to a race condition with
> +	 * concurrent memory accesses using the same TLB cache line, we still
> +	 * need a second TLB flush after this.
> +	 */
> +	adev->gmc.flush_tlb_needs_extra_type_2 =
> +		adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 0) &&
> +		adev->gmc.xgmi.num_physical_nodes;
> +	/*
> +	 * TODO: This workaround is badly documented and had a buggy
> +	 * implementation. We should probably verify what we do here.
> +	 */
> +	adev->gmc.flush_tlb_needs_extra_type_0 =
> +		adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) &&
> +		adev->rev_id == 0;
> +
>  	/* The sequence of these two function calls matters.*/
>  	gmc_v9_0_init_golden_registers(adev);
>
> --
> 2.34.1
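One note for anyone updating callers: since the amdgpu_gmc_flush_gpu_tlb_pasid() macro is replaced by the common helper of the same name, a flush by PASID would now look roughly like the sketch below. This is only an illustration, not part of the patch; example_flush_pasid() is a made-up name, while amdgpu_gmc_flush_gpu_tlb_pasid() and its parameters come from the hunks above.

#include "amdgpu.h"
#include "amdgpu_gmc.h"

/*
 * Hypothetical caller, for illustration only.  The common helper decides
 * internally whether to submit the invalidation through the KIQ ring
 * (flush_pasid_uses_kiq set, ring ready, reset semaphore available) or to
 * fall back to the per-ASIC gmc_funcs->flush_gpu_tlb_pasid callback, and
 * it applies the extra type 0/2 workaround flushes based on the new
 * adev->gmc flags set in hw_init.
 */
static int example_flush_pasid(struct amdgpu_device *adev, uint16_t pasid,
			       uint32_t inst)
{
	/* flush_type 0, all VM hubs, on GC instance inst */
	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, 0, true, inst);
}

The nice part of the split is that callers no longer need to know about the KIQ path or the Vega20+XGMI workaround at all; that logic lives entirely in the common GMC code now.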