On 07/20/2017 04:20 AM, zhoucm1 wrote:
>
> On 2017-07-19 23:34, Marek Olšák wrote:
>>
>> On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou at amd.com> wrote:
>>
>> On 2017-07-19 04:08, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> For lower overhead in the CS ioctl.
>> Winsys allocators are not used with interprocess-sharable resources.
>>
>> Hi Marek,
>>
>> Could you explain how this approach reduces overhead in the CS ioctl?
>> By reusing BOs to shorten the BO list?
>>
>> The kernel part of the work hasn't been done yet. The idea is that
>> nonsharable buffers don't have to be revalidated by TTM,
>
> OK, maybe I will only see the whole picture of this idea once you
> complete the kernel part.
> Out of curiosity, how can nonsharable buffers skip revalidation by TTM
> without exposing something like an amdgpu_bo_make_resident API?
>
> As mentioned in another thread, if we can expose a make_resident API,
> we can remove the BO list; we could even remove the reservation
> operation from the CS ioctl.
> By now I think our BO list is a very bad design.
> First, the UMD must create a BO list for every command submission,
> which is extra CPU overhead compared with the traditional way.
> Second, the kernel also has to iterate over the list. When the BO list
> is long - OpenCL programs routinely throw several thousand BOs into
> it - reservation must keep those thousands of ww_mutexes safe, and the
> CPU overhead is too big.
>
> So I strongly suggest we expose a make_resident API to user space. If
> we cannot, I would like to know the specific reasons, so we can see
> whether they are solvable.

Introducing a make_resident API would also help ARB_bindless_texture a
lot, because currently, when a texture is marked as resident, we have to
re-validate the related buffers for every new CS, like traditional
buffers. With a resident BO list, the whole mechanism could be skipped.
A rough sketch of what that could look like from the UMD side follows.
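To make the suggestion concrete, here is a very rough UMD-side sketch.
Everything in it is hypothetical - kernel_bo_make_resident stands in for
a uAPI that does not exist yet - and it only illustrates that residency
would be declared once per BO instead of being re-listed in every CS:

/* Hypothetical sketch only; no such uAPI exists at the time of writing. */
#include <stdbool.h>

struct umd_bo {
   unsigned gem_handle;
   bool resident;   /* already declared resident to the kernel? */
};

/* Stand-in for the proposed ioctl.  A real implementation would call
 * into the kernel, which would (de)register the BO in a per-context
 * resident set that TTM keeps validated across submissions. */
static int kernel_bo_make_resident(unsigned ctx_id, unsigned gem_handle,
                                   bool resident)
{
   (void)ctx_id; (void)gem_handle; (void)resident;
   return 0;
}

/* Called on first use of a BO by a context.  After this, the CS ioctl
 * would need no bo_list entry and no per-BO ww_mutex reservation. */
static int umd_declare_use(unsigned ctx_id, struct umd_bo *bo)
{
   if (bo->resident)
      return 0;   /* the common case: zero per-submission work */

   int r = kernel_bo_make_resident(ctx_id, bo->gem_handle, true);
   if (r == 0)
      bo->resident = true;
   return r;
}

The point is that umd_declare_use() becomes a no-op for every submission
after the first, whereas today we pay the BO-list creation and per-BO
reservation cost on each CS ioctl.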
> Regards,
> David Zhou
>
>> so it should remove a lot of kernel overhead and the BO list remains
>> the same.
>>
>> Marek
>>
>> Thanks,
>> David Zhou
>>
>> v2: It shouldn't crash anymore, but the kernel will reject the
>> new flag.
>> ---
>>  src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
>>  src/gallium/drivers/radeon/radeon_winsys.h      | 20 +++++++++++---
>>  src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 36 ++++++++++++++++---------
>>  src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 +++++++++++--------
>>  4 files changed, 62 insertions(+), 28 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
>> index dd1c209..2747ac4 100644
>> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
>> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
>> @@ -160,20 +160,27 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
>>  	}
>>
>>  	/* Tiled textures are unmappable. Always put them in VRAM. */
>>  	if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
>>  	    res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
>>  		res->domains = RADEON_DOMAIN_VRAM;
>>  		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
>>  			 RADEON_FLAG_GTT_WC;
>>  	}
>>
>> +	/* Only displayable single-sample textures can be shared between
>> +	 * processes. */
>> +	if (res->b.b.target == PIPE_BUFFER ||
>> +	    res->b.b.nr_samples >= 2 ||
>> +	    rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
>> +		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
>> +
>>  	/* If VRAM is just stolen system memory, allow both VRAM and
>>  	 * GTT, whichever has free space. If a buffer is evicted from
>>  	 * VRAM to GTT, it will stay there.
>>  	 *
>>  	 * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
>>  	 * placements even with a low amount of stolen VRAM.
>>  	 */
>>  	if (!rscreen->info.has_dedicated_vram &&
>>  	    (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
>>  	    res->domains == RADEON_DOMAIN_VRAM) {
>> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
>> index 351edcd..0abcb56 100644
>> --- a/src/gallium/drivers/radeon/radeon_winsys.h
>> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
>> @@ -47,20 +47,21 @@ enum radeon_bo_domain { /* bitfield */
>>      RADEON_DOMAIN_GTT  = 2,
>>      RADEON_DOMAIN_VRAM = 4,
>>      RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT
>>  };
>>
>>  enum radeon_bo_flag { /* bitfield */
>>      RADEON_FLAG_GTT_WC =        (1 << 0),
>>      RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
>>      RADEON_FLAG_NO_SUBALLOC =   (1 << 2),
>>      RADEON_FLAG_SPARSE =        (1 << 3),
>> +    RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
>>  };
>>
>>  enum radeon_bo_usage { /* bitfield */
>>      RADEON_USAGE_READ = 2,
>>      RADEON_USAGE_WRITE = 4,
>>      RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
>>
>>      /* The winsys ensures that the CS submission will be scheduled after
>>       * previously flushed CSs referencing this BO in a conflicting way.
>>       */
>> @@ -685,28 +686,33 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
>>      default:
>>          assert(0);
>>          return (enum radeon_bo_domain)0;
>>      }
>>  }
>>
>>  static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
>>  {
>>      switch (heap) {
>>      case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
>> -        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
>> +        return RADEON_FLAG_GTT_WC |
>> +               RADEON_FLAG_NO_CPU_ACCESS |
>> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
>> +
>>      case RADEON_HEAP_VRAM:
>>      case RADEON_HEAP_VRAM_GTT:
>>      case RADEON_HEAP_GTT_WC:
>> -        return RADEON_FLAG_GTT_WC;
>> +        return RADEON_FLAG_GTT_WC |
>> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
>> +
>>      case RADEON_HEAP_GTT:
>>      default:
>> -        return 0;
>> +        return RADEON_FLAG_NO_INTERPROCESS_SHARING;
>>      }
>>  }
>>
>>  /* The pb cache bucket is chosen to minimize pb_cache misses.
>>   * It must be between 0 and 3 inclusive.
>>   */
>>  static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>>  {
>>      switch (heap) {
>>      case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
>> @@ -724,22 +730,28 @@ static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>>
>>  /* Return the heap index for winsys allocators, or -1 on failure. */
>>  static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
>>                                          enum radeon_bo_flag flags)
>>  {
>>      /* VRAM implies WC (write combining) */
>>      assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>>      /* NO_CPU_ACCESS implies VRAM only. */
>>      assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
>>
>> +    /* Resources with interprocess sharing don't use any winsys allocators. */
>> +    if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
>> +        return -1;
>> +
>>      /* Unsupported flags: NO_SUBALLOC, SPARSE. */
>> -    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
>> +    if (flags & ~(RADEON_FLAG_GTT_WC |
>> +                  RADEON_FLAG_NO_CPU_ACCESS |
>> +                  RADEON_FLAG_NO_INTERPROCESS_SHARING))
>>          return -1;
>>
>>      switch (domain) {
>>      case RADEON_DOMAIN_VRAM:
>>          if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>>              return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
>>          else
>>              return RADEON_HEAP_VRAM;
>>      case RADEON_DOMAIN_VRAM_GTT:
>>          return RADEON_HEAP_VRAM_GTT;
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> index 97bbe23..06b8198 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> @@ -31,20 +31,24 @@
>>  #include "amdgpu_cs.h"
>>
>>  #include "os/os_time.h"
>>  #include "state_tracker/drm_driver.h"
>>  #include <amdgpu_drm.h>
>>  #include <xf86drm.h>
>>  #include <stdio.h>
>>  #include <inttypes.h>
>>
>> +#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
>> +#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
>> +#endif
>> +
>>  /* Set to 1 for verbose output showing committed sparse buffer ranges. */
>>  #define DEBUG_SPARSE_COMMITS 0
>>
>>  struct amdgpu_sparse_backing_chunk {
>>     uint32_t begin, end;
>>  };
>>
>>  static struct pb_buffer *
>>  amdgpu_bo_create(struct radeon_winsys *rws,
>>                   uint64_t size,
>> @@ -395,20 +399,22 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
>>
>>     if (initial_domain & RADEON_DOMAIN_VRAM)
>>        request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
>>     if (initial_domain & RADEON_DOMAIN_GTT)
>>        request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
>>
>>     if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>>        request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>>     if (flags & RADEON_FLAG_GTT_WC)
>>        request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
>> +   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
>> +      request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
>>
>>     r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
>>     if (r) {
>>        fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
>>        fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
>>        fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
>>        fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
>>        goto error_bo_alloc;
>>     }
>> @@ -1127,21 +1133,21 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
>>  static struct pb_buffer *
>>  amdgpu_bo_create(struct radeon_winsys *rws,
>>                   uint64_t size,
>>                   unsigned alignment,
>>                   enum radeon_bo_domain domain,
>>                   enum radeon_bo_flag flags)
>>  {
>>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>     struct amdgpu_winsys_bo *bo;
>> -   unsigned usage = 0, pb_cache_bucket;
>> +   unsigned usage = 0, pb_cache_bucket = 0;
>>
>>     /* VRAM implies WC. This is not optional. */
>>     assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>>
>>     /* NO_CPU_ACCESS is valid with VRAM only. */
>>     assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
>>
>>     /* Sub-allocate small buffers from slabs. */
>>     if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
>>         size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
>> @@ -1182,48 +1188,52 @@ no_slab:
>>
>>     /* This flag is irrelevant for the cache. */
>>     flags &= ~RADEON_FLAG_NO_SUBALLOC;
>>
>>     /* Align size to page size. This is the minimum alignment for normal
>>      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>>      * like constant/uniform buffers, can benefit from better and more reuse.
>>      */
>>     size = align64(size, ws->info.gart_page_size);
>>     alignment = align(alignment, ws->info.gart_page_size);
>>
>> -   int heap = radeon_get_heap_index(domain, flags);
>> -   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> -   usage = 1 << heap; /* Only set one usage bit for each heap. */
>> +   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>>
>> -   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> -   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +   if (use_reusable_pool) {
>> +      int heap = radeon_get_heap_index(domain, flags);
>> +      assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> +      usage = 1 << heap; /* Only set one usage bit for each heap. */
>>
>> -   /* Get a buffer from the cache. */
>> -   bo = (struct amdgpu_winsys_bo*)
>> -        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
>> -                                pb_cache_bucket);
>> -   if (bo)
>> -      return &bo->base;
>> +      pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> +      assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +
>> +      /* Get a buffer from the cache. */
>> +      bo = (struct amdgpu_winsys_bo*)
>> +           pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
>> +                                   pb_cache_bucket);
>> +      if (bo)
>> +         return &bo->base;
>> +   }
>>
>>     /* Create a new one. */
>>     bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>>                           pb_cache_bucket);
>>     if (!bo) {
>>        /* Clear the cache and try again. */
>>        pb_slabs_reclaim(&ws->bo_slabs);
>>        pb_cache_release_all_buffers(&ws->bo_cache);
>>        bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>>                              pb_cache_bucket);
>>        if (!bo)
>>           return NULL;
>>     }
>>
>> -   bo->u.real.use_reusable_pool = true;
>> +   bo->u.real.use_reusable_pool = use_reusable_pool;
>>     return &bo->base;
>>  }
>>
>>  static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
>>                                                 struct winsys_handle *whandle,
>>                                                 unsigned *stride,
>>                                                 unsigned *offset)
>>  {
>>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>     struct amdgpu_winsys_bo *bo;
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> index 8027a5f..15e9d38 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> @@ -907,21 +907,21 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf,
>>  static struct pb_buffer *
>>  radeon_winsys_bo_create(struct radeon_winsys *rws,
>>                          uint64_t size,
>>                          unsigned alignment,
>>                          enum radeon_bo_domain domain,
>>                          enum radeon_bo_flag flags)
>>  {
>>      struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
>>      struct radeon_bo *bo;
>> -    unsigned usage = 0, pb_cache_bucket;
>> +    unsigned usage = 0, pb_cache_bucket = 0;
>>
>>      assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
>>
>>      /* Only 32-bit sizes are supported. */
>>      if (size > UINT_MAX)
>>          return NULL;
>>
>>      /* VRAM implies WC. This is not optional. */
>>      if (domain & RADEON_DOMAIN_VRAM)
>>          flags |= RADEON_FLAG_GTT_WC;
>> @@ -962,46 +962,51 @@ no_slab:
>>
>>      /* This flag is irrelevant for the cache. */
>>      flags &= ~RADEON_FLAG_NO_SUBALLOC;
>>
>>      /* Align size to page size. This is the minimum alignment for normal
>>       * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>>       * like constant/uniform buffers, can benefit from better and more reuse.
>>       */
>>      size = align(size, ws->info.gart_page_size);
>>      alignment = align(alignment, ws->info.gart_page_size);
>>
>> -    int heap = radeon_get_heap_index(domain, flags);
>> -    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> -    usage = 1 << heap; /* Only set one usage bit for each heap. */
>> +    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>>
>> -    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> -    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +    /* Shared resources don't use cached heaps. */
>> +    if (use_reusable_pool) {
>> +        int heap = radeon_get_heap_index(domain, flags);
>> +        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> +        usage = 1 << heap; /* Only set one usage bit for each heap. */
>>
>> -    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
>> -                                           usage, pb_cache_bucket));
>> -    if (bo)
>> -        return &bo->base;
>> +        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> +        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +
>> +        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
>> +                                               usage, pb_cache_bucket));
>> +        if (bo)
>> +            return &bo->base;
>> +    }
>>
>>      bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>>                            pb_cache_bucket);
>>      if (!bo) {
>>          /* Clear the cache and try again. */
>>          if (ws->info.has_virtual_memory)
>>              pb_slabs_reclaim(&ws->bo_slabs);
>>
>>          pb_cache_release_all_buffers(&ws->bo_cache);
>>          bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>>                                pb_cache_bucket);
>>          if (!bo)
>>              return NULL;
>>      }
>>
>> -    bo->u.real.use_reusable_pool = true;
>> +    bo->u.real.use_reusable_pool = use_reusable_pool;
>>
>>      mtx_lock(&ws->bo_handles_mutex);
>>      util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
>>      mtx_unlock(&ws->bo_handles_mutex);
>>
>>      return &bo->base;
>>  }
>>
>>  static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
>>                                                     void *pointer, uint64_t size)
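For anyone skimming the patch, its allocation policy can be distilled
into a few lines. This is a sketch, not code from the tree; the flag
value mirrors the radeon_winsys.h hunk above:

/* Distilled policy sketch - not a file from the tree. */
#include <stdbool.h>

#define RADEON_FLAG_NO_INTERPROCESS_SHARING (1u << 4)

/* Mirrors the new early-out in radeon_get_heap_index(): a BO that may be
 * shared with another process never gets a cached heap, so both winsyses
 * allocate it directly from the kernel and leave use_reusable_pool false;
 * process-local BOs keep going through the slab allocator and pb_cache
 * exactly as before. */
static bool uses_reusable_pool(unsigned flags)
{
   return (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) != 0;
}

In other words, a displayable texture (flag clear) always gets a fresh
kernel allocation that is safe to export, while e.g. a constant buffer
(flag set) stays in the fast reuse paths.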