Re: [PATCH 09/13] drm/amdgpu: accommodate DOMAIN/PL_DOORBELL

Alex Deucher <alexdeucher@xxxxxxxxx> · Mon, 6 Feb 2023 11:30:09 -0500

On Mon, Feb 6, 2023 at 6:30 AM Christian König <christian.koenig@xxxxxxx> wrote:
>
> Am 03.02.23 um 20:08 schrieb Shashank Sharma:
> > From: Alex Deucher <alexander.deucher@xxxxxxx>
> >
> > This patch adds changes to accommodate the new GEM/TTM domain
> > for doorbell memory.
> >
> > Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
> > Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_bar_mgr.c | 19 ++++++++++------
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  3 ++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 24 ++++++++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     | 17 ++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  3 ++-
> >   7 files changed, 58 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index e3e2e6e3b485..e1c1a360614e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -974,6 +974,7 @@ struct amdgpu_device {
> >       atomic64_t vram_pin_size;
> >       atomic64_t visible_pin_size;
> >       atomic64_t gart_pin_size;
> > +     atomic64_t doorbell_pin_size;
>
> Please drop that, the amount of pinned doorbells is not needed as far as
> I can see.
>
> >
> >       /* soc15 register offset based on ip, instance and  segment */
> >       uint32_t                *reg_offset[MAX_HWIP][HWIP_MAX_INSTANCE];
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bar_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bar_mgr.c
> > index 0656e5bb4f05..43a3137019b1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bar_mgr.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bar_mgr.c
> > @@ -659,15 +659,17 @@ static void amdgpu_bar_mgr_del(struct ttm_resource_manager *man,
> >    * @dev: the other device
> >    * @dir: dma direction
> >    * @sgt: resulting sg table
> > + * @mem_type: memory type
> >    *
> >    * Allocate and fill a sg table from a VRAM allocation.
> >    */
> >   int amdgpu_bar_mgr_alloc_sgt(struct amdgpu_device *adev,
> > -                           struct ttm_resource *res,
> > -                           u64 offset, u64 length,
> > -                           struct device *dev,
> > -                           enum dma_data_direction dir,
> > -                           struct sg_table **sgt)
> > +                          struct ttm_resource *res,
> > +                          u64 offset, u64 length,
> > +                          struct device *dev,
> > +                          enum dma_data_direction dir,
> > +                          struct sg_table **sgt,
> > +                          u32 mem_type)
>
> And again that doesn't make any sense at all.
>
> For now we don't want to export doorbells through DMA-buf.
>
> >   {
> >       struct amdgpu_res_cursor cursor;
> >       struct scatterlist *sg;
> > @@ -701,10 +703,15 @@ int amdgpu_bar_mgr_alloc_sgt(struct amdgpu_device *adev,
> >        */
> >       amdgpu_res_first(res, offset, length, &cursor);
> >       for_each_sgtable_sg((*sgt), sg, i) {
> > -             phys_addr_t phys = cursor.start + adev->gmc.vram_aper_base;
> > +             phys_addr_t phys = cursor.start;
> >               size_t size = cursor.size;
> >               dma_addr_t addr;
> >
> > +             if (mem_type == TTM_PL_VRAM)
> > +                     phys += adev->gmc.vram_aper_base;
> > +             else
> > +                     phys += adev->gmc.doorbell_aper_base;
> > +
> >               addr = dma_map_resource(dev, phys, size, dir,
> >                                       DMA_ATTR_SKIP_CPU_SYNC);
> >               r = dma_mapping_error(dev, addr);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> > index c48ccde281c3..c645bdc49f34 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> > @@ -179,9 +179,10 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach,
> >               break;
> >
> >       case TTM_PL_VRAM:
> > +     case AMDGPU_PL_DOORBELL:
> >               r = amdgpu_bar_mgr_alloc_sgt(adev, bo->tbo.resource, 0,
> >                                            bo->tbo.base.size, attach->dev,
> > -                                          dir, &sgt);
> > +                                          dir, &sgt, bo->tbo.resource->mem_type);
> >               if (r)
> >                       return ERR_PTR(r);
> >               break;
>
> That stuff can be dropped as well as far as I can see.
>
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > index 887fc53a7d16..b2cfd46c459b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > @@ -147,6 +147,18 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
> >               c++;
> >       }
> >
> > +     if (domain & AMDGPU_GEM_DOMAIN_DOORBELL) {
> > +             places[c].fpfn = 0;
> > +             places[c].lpfn = 0;
> > +             places[c].mem_type = AMDGPU_PL_DOORBELL;
> > +             places[c].flags = 0;
> > +             places[c].flags |= TTM_PL_FLAG_TOPDOWN;
> > +
> > +             if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
> > +                     places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
> > +             c++;
> > +     }
> > +
> >       if (domain & AMDGPU_GEM_DOMAIN_GTT) {
> >               places[c].fpfn = 0;
> >               places[c].lpfn = 0;
> > @@ -464,6 +476,13 @@ static bool amdgpu_bo_validate_size(struct amdgpu_device *adev,
> >               if (man && size < man->size)
> >                       return true;
> >               goto fail;
> > +     } else if (domain & AMDGPU_GEM_DOMAIN_DOORBELL) {
> > +             man = ttm_manager_type(&adev->mman.bdev, AMDGPU_PL_DOORBELL);
> > +
> > +             if (size < man->size)
> > +                     return true;
> > +             else
> > +                     goto fail;
>
> Do we ever want userspace to allocate more than one doorbell page at a time?

One 4k page would support 512 64-bit doorbells and hence 512 user
queues.  That seems like a reasonable queue limit.

Alex

>
> >       }
> >
> >       /* TODO add more domains checks, such as AMDGPU_GEM_DOMAIN_CPU */
> > @@ -962,8 +981,9 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
> >                            &adev->visible_pin_size);
> >       } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
> >               atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);
> > +     } else if (domain == AMDGPU_GEM_DOMAIN_DOORBELL) {
> > +             atomic64_add(amdgpu_bo_size(bo), &adev->doorbell_pin_size);
>
> Can be dropped.
>
> >       }
> > -
> >   error:
> >       return r;
> >   }
> > @@ -1013,6 +1033,8 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
> >                            &adev->visible_pin_size);
> >       } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
> >               atomic64_sub(amdgpu_bo_size(bo), &adev->gart_pin_size);
> > +     } else if (bo->tbo.resource->mem_type == AMDGPU_PL_DOORBELL) {
> > +             atomic64_sub(amdgpu_bo_size(bo), &adev->doorbell_pin_size);
>
> Ditto.
>
> >       }
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > index 93207badf83f..082f451d26f4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > @@ -326,7 +326,7 @@ int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr);
> >   u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
> >   u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
> >   void amdgpu_bo_get_memory(struct amdgpu_bo *bo, uint64_t *vram_mem,
> > -                             uint64_t *gtt_mem, uint64_t *cpu_mem);
> > +                       uint64_t *gtt_mem, uint64_t *cpu_mem);
> >   void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo);
> >   int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
> >                            struct dma_fence **fence);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index bb2230d14ea6..71eff2f195a7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -128,6 +128,7 @@ static void amdgpu_evict_flags(struct ttm_buffer_object *bo,
> >       case AMDGPU_PL_GDS:
> >       case AMDGPU_PL_GWS:
> >       case AMDGPU_PL_OA:
> > +     case AMDGPU_PL_DOORBELL:
> >               placement->num_placement = 0;
> >               placement->num_busy_placement = 0;
> >               return;
> > @@ -500,9 +501,11 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> >       if (old_mem->mem_type == AMDGPU_PL_GDS ||
> >           old_mem->mem_type == AMDGPU_PL_GWS ||
> >           old_mem->mem_type == AMDGPU_PL_OA ||
> > +         old_mem->mem_type == AMDGPU_PL_DOORBELL ||
> >           new_mem->mem_type == AMDGPU_PL_GDS ||
> >           new_mem->mem_type == AMDGPU_PL_GWS ||
> > -         new_mem->mem_type == AMDGPU_PL_OA) {
> > +         new_mem->mem_type == AMDGPU_PL_OA ||
> > +         new_mem->mem_type == AMDGPU_PL_DOORBELL) {
> >               /* Nothing to save here */
> >               ttm_bo_move_null(bo, new_mem);
> >               goto out;
> > @@ -586,6 +589,17 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev,
> >               mem->bus.offset += adev->gmc.vram_aper_base;
> >               mem->bus.is_iomem = true;
> >               break;
> > +     case AMDGPU_PL_DOORBELL:
> > +             mem->bus.offset = mem->start << PAGE_SHIFT;
>
> This won't work if we ever allow allocating more than one page for
> a doorbell.
>
> > +
> > +             if (adev->mman.doorbell_aper_base_kaddr &&
> > +                 mem->placement & TTM_PL_FLAG_CONTIGUOUS)
> > +                     mem->bus.addr = (u8 *)adev->mman.doorbell_aper_base_kaddr +
> > +                                     mem->bus.offset;
>
> This doesn't make any sense at all. TTM_PL_FLAG_CONTIGUOUS should
> probably be completely ignored for doorbells.
>
> Regards,
> Christian.
>
> > +
> > +             mem->bus.offset += adev->gmc.doorbell_aper_base;
> > +             mem->bus.is_iomem = true;
> > +             break;
> >       default:
> >               return -EINVAL;
> >       }
> > @@ -1267,6 +1281,7 @@ uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_resource *mem)
> >               flags |= AMDGPU_PTE_VALID;
> >
> >       if (mem && (mem->mem_type == TTM_PL_TT ||
> > +                 mem->mem_type == AMDGPU_PL_DOORBELL ||
> >                   mem->mem_type == AMDGPU_PL_PREEMPT)) {
> >               flags |= AMDGPU_PTE_SYSTEM;
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > index 243deb1ffc54..9971665d7d99 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > @@ -124,7 +124,8 @@ int amdgpu_bar_mgr_alloc_sgt(struct amdgpu_device *adev,
> >                            u64 offset, u64 size,
> >                            struct device *dev,
> >                            enum dma_data_direction dir,
> > -                          struct sg_table **sgt);
> > +                          struct sg_table **sgt,
> > +                          u32 mem_type);
> >   void amdgpu_bar_mgr_free_sgt(struct device *dev,
> >                            enum dma_data_direction dir,
> >                            struct sg_table *sgt);
>