Re: [PATCH v3 1/3] drm/amdgpu: replace TLB seq callback with HW seq

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - General]


Please feel free to use:
Reviewed-by: Shashank Sharma <shashank.sharma@xxxxxxx>

Regards
Shashank

From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
Sent: Monday, February 26, 2024 3:45 PM
To: Sharma, Shashank <Shashank.Sharma@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>
Cc: Koenig, Christian <Christian.Koenig@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@xxxxxxx>
Subject: Re: [PATCH v3 1/3] drm/amdgpu: replace TLB seq callback with HW seq
 
Am 23.02.24 um 14:42 schrieb Shashank Sharma:
> From: Christian König <christian.koenig@xxxxxxx>
>
> The callbacks we installed for the SDMA updates were actually pretty
> horrible. Since we now have seq64, use that one and HW seq writes
> instead.
>
> V2:(Shashank)
>   - rebased on amd-drm-staging-next
>   - changed amdgpu_seq64_gpu_addr
>
> Cc: Christian König <christian.koenig@xxxxxxx>
> Cc: Alex Deucher <alexander.deucher@xxxxxxx>
> Cc: Felix Kuehling <felix.kuehling@xxxxxxx>
> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>

Shashank, can I get an Rb on this patch here so that I can push it to
amd-staging-drm-next?

The patch was basically just to double check if the seq64 code works as
intended.

Thanks,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c   | 14 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h   |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 79 ++++-----------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      | 27 ++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  5 ++
>   7 files changed, 42 insertions(+), 89 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
> index 3d0d56087d41..300dc79fa2b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
> @@ -199,6 +199,20 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
>                __clear_bit(bit_pos, adev->seq64.used);
>   }
>  
> +/**
> + * amdgpu_seq64_gpu_addr - Calculate GPU addr from va
> + *
> + * @adev: amdgpu_device pointer
> + * @va: virtual address in client address space
> + *
> + * Calculate the GART address for a VA.
> + */
> +u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va)
> +{
> +     return va - amdgpu_seq64_get_va_base(adev) +
> +             amdgpu_bo_gpu_offset(adev->seq64.sbo);
> +}
> +
>   /**
>    * amdgpu_seq64_fini - Cleanup seq64 driver
>    *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
> index 4203b2ab318d..63e8ac0a2057 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
> @@ -43,6 +43,7 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 gpu_addr);
>   int amdgpu_seq64_map(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>                     struct amdgpu_bo_va **bo_va);
>   void amdgpu_seq64_unmap(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv);
> +u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va);
>  
>   #endif
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ed4a8c5d26d7..0960e0a665d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -111,21 +111,6 @@ struct amdgpu_prt_cb {
>        struct dma_fence_cb cb;
>   };
>  
> -/**
> - * struct amdgpu_vm_tlb_seq_struct - Helper to increment the TLB flush sequence
> - */
> -struct amdgpu_vm_tlb_seq_struct {
> -     /**
> -      * @vm: pointer to the amdgpu_vm structure to set the fence sequence on
> -      */
> -     struct amdgpu_vm *vm;
> -
> -     /**
> -      * @cb: callback
> -      */
> -     struct dma_fence_cb cb;
> -};
> -
>   /**
>    * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping
>    *
> @@ -862,23 +847,6 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
>        return r;
>   }
>  
> -/**
> - * amdgpu_vm_tlb_seq_cb - make sure to increment tlb sequence
> - * @fence: unused
> - * @cb: the callback structure
> - *
> - * Increments the tlb sequence to make sure that future CS execute a VM flush.
> - */
> -static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
> -                              struct dma_fence_cb *cb)
> -{
> -     struct amdgpu_vm_tlb_seq_struct *tlb_cb;
> -
> -     tlb_cb = container_of(cb, typeof(*tlb_cb), cb);
> -     atomic64_inc(&tlb_cb->vm->tlb_seq);
> -     kfree(tlb_cb);
> -}
> -
>   /**
>    * amdgpu_vm_update_range - update a range in the vm page table
>    *
> @@ -911,7 +879,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>                           struct dma_fence **fence)
>   {
>        struct amdgpu_vm_update_params params;
> -     struct amdgpu_vm_tlb_seq_struct *tlb_cb;
>        struct amdgpu_res_cursor cursor;
>        enum amdgpu_sync_mode sync_mode;
>        int r, idx;
> @@ -919,12 +886,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>        if (!drm_dev_enter(adev_to_drm(adev), &idx))
>                return -ENODEV;
>  
> -     tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
> -     if (!tlb_cb) {
> -             r = -ENOMEM;
> -             goto error_unlock;
> -     }
> -
>        /* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
>         * heavy-weight flush TLB unconditionally.
>         */
> @@ -942,6 +903,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>        params.immediate = immediate;
>        params.pages_addr = pages_addr;
>        params.unlocked = unlocked;
> +     params.needs_flush = flush_tlb;
>        params.allow_override = allow_override;
>  
>        /* Implicitly sync to command submissions in the same VM before
> @@ -955,7 +917,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>        amdgpu_vm_eviction_lock(vm);
>        if (vm->evicting) {
>                r = -EBUSY;
> -             goto error_free;
> +             goto error_unlock;
>        }
>  
>        if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
> @@ -968,7 +930,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  
>        r = vm->update_funcs->prepare(&params, resv, sync_mode);
>        if (r)
> -             goto error_free;
> +             goto error_unlock;
>  
>        amdgpu_res_first(pages_addr ? NULL : res, offset,
>                         (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
> @@ -1018,7 +980,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>                tmp = start + num_entries;
>                r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
>                if (r)
> -                     goto error_free;
> +                     goto error_unlock;
>  
>                amdgpu_res_next(&cursor, num_entries * AMDGPU_GPU_PAGE_SIZE);
>                start = tmp;
> @@ -1026,22 +988,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  
>        r = vm->update_funcs->commit(&params, fence);
>  
> -     if (flush_tlb || params.table_freed) {
> -             tlb_cb->vm = vm;
> -             if (fence && *fence &&
> -                 !dma_fence_add_callback(*fence, &tlb_cb->cb,
> -                                        amdgpu_vm_tlb_seq_cb)) {
> -                     dma_fence_put(vm->last_tlb_flush);
> -                     vm->last_tlb_flush = dma_fence_get(*fence);
> -             } else {
> -                     amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
> -             }
> -             tlb_cb = NULL;
> -     }
> -
> -error_free:
> -     kfree(tlb_cb);
> -
>   error_unlock:
>        amdgpu_vm_eviction_unlock(vm);
>        drm_dev_exit(idx);
> @@ -2260,10 +2206,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>        INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
>        INIT_KFIFO(vm->faults);
>  
> -     r = amdgpu_vm_init_entities(adev, vm);
> +     r = amdgpu_seq64_alloc(adev, &vm->tlb_seq_va, &vm->tlb_seq_cpu_addr);
>        if (r)
>                return r;
>  
> +     r = amdgpu_vm_init_entities(adev, vm);
> +     if (r)
> +             goto error_free_seq;
> +
>        vm->pte_support_ats = false;
>        vm->is_compute_context = false;
>  
> @@ -2283,7 +2233,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  
>        vm->last_update = dma_fence_get_stub();
>        vm->last_unlocked = dma_fence_get_stub();
> -     vm->last_tlb_flush = dma_fence_get_stub();
>        vm->generation = 0;
>  
>        mutex_init(&vm->eviction_lock);
> @@ -2322,10 +2271,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>        amdgpu_bo_unref(&root_bo);
>  
>   error_free_delayed:
> -     dma_fence_put(vm->last_tlb_flush);
>        dma_fence_put(vm->last_unlocked);
>        amdgpu_vm_fini_entities(vm);
>  
> +error_free_seq:
> +     amdgpu_seq64_free(adev, vm->tlb_seq_va);
>        return r;
>   }
>  
> @@ -2441,7 +2391,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>        struct amdgpu_bo_va_mapping *mapping, *tmp;
>        bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
>        struct amdgpu_bo *root;
> -     unsigned long flags;
>        int i;
>  
>        amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
> @@ -2453,11 +2402,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>        amdgpu_vm_set_pasid(adev, vm, 0);
>        dma_fence_wait(vm->last_unlocked, false);
>        dma_fence_put(vm->last_unlocked);
> -     dma_fence_wait(vm->last_tlb_flush, false);
> -     /* Make sure that all fence callbacks have completed */
> -     spin_lock_irqsave(vm->last_tlb_flush->lock, flags);
> -     spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags);
> -     dma_fence_put(vm->last_tlb_flush);
> +     amdgpu_seq64_free(adev, vm->tlb_seq_va);
>  
>        list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
>                if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 666698a57192..ac9380afcb69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -247,9 +247,9 @@ struct amdgpu_vm_update_params {
>        unsigned int num_dw_left;
>  
>        /**
> -      * @table_freed: return true if page table is freed when updating
> +      * @needs_flush: true whenever we need to invalidate the TLB
>         */
> -     bool table_freed;
> +     bool needs_flush;
>  
>        /**
>         * @allow_override: true for memory that is not uncached: allows MTYPE
> @@ -328,9 +328,11 @@ struct amdgpu_vm {
>        struct drm_sched_entity immediate;
>        struct drm_sched_entity delayed;
>  
> -     /* Last finished delayed update */
> +     /* Sequence number indicating necessary TLB flush */
>        atomic64_t              tlb_seq;
> -     struct dma_fence        *last_tlb_flush;
> +     uint64_t                tlb_seq_va;
> +     uint64_t                *tlb_seq_cpu_addr;
> +
>        atomic64_t              kfd_last_flushed_seq;
>  
>        /* How many times we had to re-generate the page tables */
> @@ -549,22 +551,7 @@ int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>    */
>   static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm *vm)
>   {
> -     unsigned long flags;
> -     spinlock_t *lock;
> -
> -     /*
> -      * Workaround to stop racing between the fence signaling and handling
> -      * the cb. The lock is static after initially setting it up, just make
> -      * sure that the dma_fence structure isn't freed up.
> -      */
> -     rcu_read_lock();
> -     lock = vm->last_tlb_flush->lock;
> -     rcu_read_unlock();
> -
> -     spin_lock_irqsave(lock, flags);
> -     spin_unlock_irqrestore(lock, flags);
> -
> -     return atomic64_read(&vm->tlb_seq);
> +     return READ_ONCE(*vm->tlb_seq_cpu_addr);
>   }
>  
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> index 6e31621452de..0f8fd0acef7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> @@ -108,7 +108,8 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
>   static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
>                                struct dma_fence **fence)
>   {
> -     /* Flush HDP */
> +     if (p->needs_flush)
> +             *p->vm->tlb_seq_cpu_addr = atomic64_inc_return(&p->vm->tlb_seq);
>        mb();
>        amdgpu_device_flush_hdp(p->adev, NULL);
>        return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index a160265ddc07..95dc0afdaffb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -1056,7 +1056,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>                        while (cursor.pfn < frag_start) {
>                                /* Make sure previous mapping is freed */
>                                if (cursor.entry->bo) {
> -                                     params->table_freed = true;
> +                                     params->needs_flush = true;
>                                        amdgpu_vm_pt_free_dfs(adev, params->vm,
>                                                              &cursor,
>                                                              params->unlocked);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 349416e176a1..91cc3121fd4b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -126,6 +126,11 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>  
>        WARN_ON(ib->length_dw == 0);
>        amdgpu_ring_pad_ib(ring, ib);
> +     if (p->needs_flush) {
> +             p->job->uf_addr = amdgpu_seq64_gpu_addr(p->adev,
> +                                                     p->vm->tlb_seq_va);
> +             p->job->uf_sequence = atomic64_inc_return(&p->vm->tlb_seq);
> +     }
>        WARN_ON(ib->length_dw > p->num_dw_left);
>        f = amdgpu_job_submit(p->job);
>  


[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux