Re: [PATCH v5 4/5] drm/amdgpu: address remove from fault filter

philip yang <yangp@xxxxxxx> · Wed, 28 Apr 2021 10:57:02 -0400



    On 2021-04-28 5:00 a.m., Christian
      König wrote:

    
      Am 28.04.21 um 09:53 schrieb Felix Kuehling:
      

      Am 2021-04-28 um 2:54 a.m. schrieb
        Christian König:
        

        Am 27.04.21 um 20:21 schrieb Felix
          Kuehling:
          

          On 2021-04-27 10:51 a.m., Philip Yang
            wrote:
            

            Add interface to remove address from
              fault filter ring by resetting
              

              fault ring entry key, then future vm fault on the address
              will be
              

              processed to recover.
              

              Define fault key as atomic64_t type to use atomic
              read/set/cmpxchg key
              

              to protect fault ring access by interrupt handler and
              interrupt
              

              deferred
              

              work for vg20. Change fault->timestamp to 48-bit to
              share same uint64_t
              

              with 8-bit fault->next, it is enough for 48bit IH
              timestamp.
              

              Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
              

              ---
              

                 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48
              

              ++++++++++++++++++++++---
              

                 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
              

                 2 files changed, 48 insertions(+), 6 deletions(-)
              

              diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
              

              b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
              

              index c39ed9eb0987..a2e81e913abb 100644
              

              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
              

              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
              

              @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
              

              amdgpu_device *adev, struct amdgpu_gmc *mc)
              

                             mc->agp_size >> 20,
              mc->agp_start, mc->agp_end);
              

                 }
              

                 +/**
              

              + * amdgpu_gmc_fault_key - get hask key from vm fault
              address and pasid
              

              + *
              

              + * @addr: 48bit physical address
              

              + * @pasid: 4 bit
              

            This comment is misleading. The PASID is not 4-bit. It's
            16-bit. But
            

            shifting the address by 4 bit is sufficient because the
            address is
            

            page-aligned, which means the low 12 bits are 0. So this
            would be
            

            more accurate:
            

            @addr: 48 bit physical address, page aligned (36 significant
            bits)
            

            @pasid: 16 bit process address space identifier
            

            With that fixed, the patch is
            

            Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
            

            + */
              

              +static inline uint64_t amdgpu_gmc_fault_key(uint64_t
              addr, uint16_t
              

              pasid)
              

              +{
              

              +    return addr << 4 | pasid;
              

              +}
              

          Maybe changing this so that we have "addr * ((1 << 16) /
          

          AMDGPU_PAGE_SIZE) | pasid" would help to better document that?
          

        I find this a mix of multiplication, division and bit-shift more
        

        confusing. How about "addr << (16 - AMDGPU_GPU_PAGE_SHIFT)
        | pasid"?
        

      Yeah, that is even better.
      

    I have pushed the patch yesterday, didn't pick this, should wait
      longer before push in future.
    Thanks,
    Philip

    
      Regards,
      

      Christian.
      

        Regards,
        

           Felix
        

        Christian.
          

            +
              

                 /**
              

                  * amdgpu_gmc_filter_faults - filter VM faults
              

                  *
              

              @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct
              

              amdgpu_device *adev, uint64_t addr,
              

                                   uint16_t pasid, uint64_t timestamp)
              

                 {
              

                     struct amdgpu_gmc *gmc = &adev->gmc;
              

              -
              

              -    uint64_t stamp, key = addr << 4 | pasid;
              

              +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr,
              pasid);
              

                     struct amdgpu_gmc_fault *fault;
              

                     uint32_t hash;
              

                 @@ -365,7 +375,7 @@ bool
              amdgpu_gmc_filter_faults(struct
              

              amdgpu_device *adev, uint64_t addr,
              

                     while (fault->timestamp >= stamp) {
              

                         uint64_t tmp;
              

                 -        if (fault->key == key)
              

              +        if (atomic64_read(&fault->key) == key)
              

                             return true;
              

                           tmp = fault->timestamp;
              

              @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct
              

              amdgpu_device *adev, uint64_t addr,
              

                       /* Add the fault to the ring */
              

                     fault =
              &gmc->fault_ring[gmc->last_fault];
              

              -    fault->key = key;
              

              +    atomic64_set(&fault->key, key);
              

                     fault->timestamp = timestamp;
              

                       /* And update the hash */
              

              @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct
              

              amdgpu_device *adev, uint64_t addr,
              

                     return false;
              

                 }
              

                 +/**
              

              + * amdgpu_gmc_filter_faults_remove - remove address from
              VM faults
              

              filter
              

              + *
              

              + * @adev: amdgpu device structure
              

              + * @addr: address of the VM fault
              

              + * @pasid: PASID of the process causing the fault
              

              + *
              

              + * Remove the address from fault filter, then future vm
              fault on
              

              this address
              

              + * will pass to retry fault handler to recover.
              

              + */
              

              +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
              *adev,
              

              uint64_t addr,
              

              +                     uint16_t pasid)
              

              +{
              

              +    struct amdgpu_gmc *gmc = &adev->gmc;
              

              +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
              

              +    struct amdgpu_gmc_fault *fault;
              

              +    uint32_t hash;
              

              +    uint64_t tmp;
              

              +
              

              +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
              

              +    fault =
              &gmc->fault_ring[gmc->fault_hash[hash].idx];
              

              +    do {
              

              +        if (atomic64_cmpxchg(&fault->key, key, 0)
              == key)
              

              +            break;
              

              +
              

              +        tmp = fault->timestamp;
              

              +        fault = &gmc->fault_ring[fault->next];
              

              +    } while (fault->timestamp < tmp);
              

              +}
              

              +
              

                 int amdgpu_gmc_ras_late_init(struct amdgpu_device
              *adev)
              

                 {
              

                     int r;
              

              diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
              

              b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
              

              index 9d11c02a3938..6aa1d52d3aee 100644
              

              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
              

              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
              

              @@ -66,9 +66,9 @@ struct firmware;
              

                  * GMC page fault information
              

                  */
              

                 struct amdgpu_gmc_fault {
              

              -    uint64_t    timestamp;
              

              +    uint64_t    timestamp:48;
              

                     uint64_t    next:AMDGPU_GMC_FAULT_RING_ORDER;
              

              -    uint64_t    key:52;
              

              +    atomic64_t    key;
              

                 };
              

                   /*
              

              @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
              

              amdgpu_device *adev,
              

                                  struct amdgpu_gmc *mc);
              

                 bool amdgpu_gmc_filter_faults(struct amdgpu_device
              *adev, uint64_t
              

              addr,
              

                                   uint16_t pasid, uint64_t timestamp);
              

              +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
              *adev,
              

              uint64_t addr,
              

              +                     uint16_t pasid);
              

                 int amdgpu_gmc_ras_late_init(struct amdgpu_device
              *adev);
              

                 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
              

                 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device
              *adev);
              

            _______________________________________________
            

            amd-gfx mailing list
            

            amd-gfx@xxxxxxxxxxxxxxxxxxxxx
            

https://nam11.safelinks.protection.outlook.com/?url="">
            

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx