On 2017-07-19 04:08, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> For lower overhead in the CS ioctl.
> Winsys allocators are not used with interprocess-sharable resources.

Hi Marek,

Could you explain how this approach reduces overhead in the CS ioctl?
Is it by reusing BOs, so that the BO list becomes shorter?

Thanks,
David Zhou

>
> v2: It shouldn't crash anymore, but the kernel will reject the new flag.
> ---
>  src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
>  src/gallium/drivers/radeon/radeon_winsys.h      | 20 +++++++++++---
>  src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 36 ++++++++++++++++---------
>  src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 +++++++++++--------
>  4 files changed, 62 insertions(+), 28 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
> index dd1c209..2747ac4 100644
> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
> @@ -160,20 +160,27 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
>      }
>
>      /* Tiled textures are unmappable. Always put them in VRAM. */
>      if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
>          res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
>          res->domains = RADEON_DOMAIN_VRAM;
>          res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
>                        RADEON_FLAG_GTT_WC;
>      }
>
> +    /* Only displayable single-sample textures can be shared between
> +     * processes. */
> +    if (res->b.b.target == PIPE_BUFFER ||
> +        res->b.b.nr_samples >= 2 ||
> +        rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
> +        res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
> +
>      /* If VRAM is just stolen system memory, allow both VRAM and
>       * GTT, whichever has free space. If a buffer is evicted from
>       * VRAM to GTT, it will stay there.
>       *
>       * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
>       * placements even with a low amount of stolen VRAM.
>       */
>      if (!rscreen->info.has_dedicated_vram &&
>          (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
>          res->domains == RADEON_DOMAIN_VRAM) {
> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
> index 351edcd..0abcb56 100644
> --- a/src/gallium/drivers/radeon/radeon_winsys.h
> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
> @@ -47,20 +47,21 @@ enum radeon_bo_domain { /* bitfield */
>      RADEON_DOMAIN_GTT  = 2,
>      RADEON_DOMAIN_VRAM = 4,
>      RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT
>  };
>
>  enum radeon_bo_flag { /* bitfield */
>      RADEON_FLAG_GTT_WC =        (1 << 0),
>      RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
>      RADEON_FLAG_NO_SUBALLOC =   (1 << 2),
>      RADEON_FLAG_SPARSE =        (1 << 3),
> +    RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
>  };
>
>  enum radeon_bo_usage { /* bitfield */
>      RADEON_USAGE_READ = 2,
>      RADEON_USAGE_WRITE = 4,
>      RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
>
>      /* The winsys ensures that the CS submission will be scheduled after
>       * previously flushed CSs referencing this BO in a conflicting way.
>       */
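To make the question concrete: if I read the patch right, the new flag is a promise that the BO will never be exported to another process, and that promise is what makes the BO eligible for the winsys allocators. A minimal sketch of a hypothetical caller, assuming the buffer_create entry point declared in this header (the size, alignment and domain below are made-up values, not from the patch):

   /* Hypothetical caller: a per-context buffer that is never shared,
    * so it can opt in to the new flag and stay eligible for the
    * winsys slab/cache allocators. */
   struct pb_buffer *buf =
      ws->buffer_create(ws, 64 * 1024, 4096, RADEON_DOMAIN_GTT,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING);

Is the intent that the kernel can then also skip the sharing-related bookkeeping for such BOs in the CS ioctl?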
> @@ -685,28 +686,33 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
>      default:
>          assert(0);
>          return (enum radeon_bo_domain)0;
>      }
>  }
>
>  static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
>  {
>      switch (heap) {
>      case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
> -        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
> +        return RADEON_FLAG_GTT_WC |
> +               RADEON_FLAG_NO_CPU_ACCESS |
> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
> +
>      case RADEON_HEAP_VRAM:
>      case RADEON_HEAP_VRAM_GTT:
>      case RADEON_HEAP_GTT_WC:
> -        return RADEON_FLAG_GTT_WC;
> +        return RADEON_FLAG_GTT_WC |
> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
> +
>      case RADEON_HEAP_GTT:
>      default:
> -        return 0;
> +        return RADEON_FLAG_NO_INTERPROCESS_SHARING;
>      }
>  }
>
>  /* The pb cache bucket is chosen to minimize pb_cache misses.
>   * It must be between 0 and 3 inclusive.
>   */
>  static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>  {
>      switch (heap) {
>      case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
> @@ -724,22 +730,28 @@ static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>
>  /* Return the heap index for winsys allocators, or -1 on failure. */
>  static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
>                                          enum radeon_bo_flag flags)
>  {
>      /* VRAM implies WC (write combining) */
>      assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>      /* NO_CPU_ACCESS implies VRAM only. */
>      assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
>
> +    /* Resources with interprocess sharing don't use any winsys allocators. */
> +    if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
> +        return -1;
> +
>      /* Unsupported flags: NO_SUBALLOC, SPARSE. */
> -    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
> +    if (flags & ~(RADEON_FLAG_GTT_WC |
> +                  RADEON_FLAG_NO_CPU_ACCESS |
> +                  RADEON_FLAG_NO_INTERPROCESS_SHARING))
>          return -1;
>
>      switch (domain) {
>      case RADEON_DOMAIN_VRAM:
>          if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>              return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
>          else
>              return RADEON_HEAP_VRAM;
>      case RADEON_DOMAIN_VRAM_GTT:
>          return RADEON_HEAP_VRAM_GTT;
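If I follow radeon_get_heap_index() correctly, the flag is now effectively the admission ticket to the reusable heaps: without it every lookup returns -1 and the buffer bypasses the cache entirely. A small sanity sketch of the mapping I would expect, assuming the definitions in this header (the RADEON_DOMAIN_GTT case is cut off in the hunk above, so that part is my inference):

   #include <assert.h>
   #include "radeon_winsys.h"

   static void check_heap_gating(void)
   {
      /* Shareable buffer (flag absent): no winsys heap, i.e. -1. */
      assert(radeon_get_heap_index(RADEON_DOMAIN_GTT, 0) == -1);

      /* Non-shareable GTT buffer: plain GTT heap. */
      assert(radeon_get_heap_index(RADEON_DOMAIN_GTT,
                                   RADEON_FLAG_NO_INTERPROCESS_SHARING) ==
             RADEON_HEAP_GTT);

      /* VRAM implies GTT_WC; with the new flag it stays cacheable. */
      assert(radeon_get_heap_index(RADEON_DOMAIN_VRAM,
                                   RADEON_FLAG_GTT_WC |
                                   RADEON_FLAG_NO_INTERPROCESS_SHARING) ==
             RADEON_HEAP_VRAM);
   }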
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> index 97bbe23..06b8198 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> @@ -31,20 +31,24 @@
>
>  #include "amdgpu_cs.h"
>
>  #include "os/os_time.h"
>  #include "state_tracker/drm_driver.h"
>  #include <amdgpu_drm.h>
>  #include <xf86drm.h>
>  #include <stdio.h>
>  #include <inttypes.h>
>
> +#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
> +#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
> +#endif
> +
>  /* Set to 1 for verbose output showing committed sparse buffer ranges. */
>  #define DEBUG_SPARSE_COMMITS 0
>
>  struct amdgpu_sparse_backing_chunk {
>     uint32_t begin, end;
>  };
>
>  static struct pb_buffer *
>  amdgpu_bo_create(struct radeon_winsys *rws,
>                   uint64_t size,
> @@ -395,20 +399,22 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
>
>     if (initial_domain & RADEON_DOMAIN_VRAM)
>        request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
>     if (initial_domain & RADEON_DOMAIN_GTT)
>        request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
>
>     if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>        request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>     if (flags & RADEON_FLAG_GTT_WC)
>        request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
> +   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
> +      request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
>
>     r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
>     if (r) {
>        fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
>        fprintf(stderr, "amdgpu: size      : %"PRIu64" bytes\n", size);
>        fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
>        fprintf(stderr, "amdgpu: domains   : %u\n", initial_domain);
>        goto error_bo_alloc;
>     }
>
> @@ -1127,21 +1133,21 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
>
>  static struct pb_buffer *
>  amdgpu_bo_create(struct radeon_winsys *rws,
>                   uint64_t size,
>                   unsigned alignment,
>                   enum radeon_bo_domain domain,
>                   enum radeon_bo_flag flags)
>  {
>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>     struct amdgpu_winsys_bo *bo;
> -   unsigned usage = 0, pb_cache_bucket;
> +   unsigned usage = 0, pb_cache_bucket = 0;
>
>     /* VRAM implies WC. This is not optional. */
>     assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>
>     /* NO_CPU_ACCESS is valid with VRAM only. */
>     assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
>
>     /* Sub-allocate small buffers from slabs. */
>     if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
>         size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
> @@ -1182,48 +1188,52 @@ no_slab:
>     /* This flag is irrelevant for the cache. */
>     flags &= ~RADEON_FLAG_NO_SUBALLOC;
>
>     /* Align size to page size. This is the minimum alignment for normal
>      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>      * like constant/uniform buffers, can benefit from better and more reuse.
>      */
>     size = align64(size, ws->info.gart_page_size);
>     alignment = align(alignment, ws->info.gart_page_size);
>
> -   int heap = radeon_get_heap_index(domain, flags);
> -   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> -   usage = 1 << heap; /* Only set one usage bit for each heap. */
> +   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>
> -   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> -   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +   if (use_reusable_pool) {
> +      int heap = radeon_get_heap_index(domain, flags);
> +      assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> +      usage = 1 << heap; /* Only set one usage bit for each heap. */
>
> -   /* Get a buffer from the cache. */
> -   bo = (struct amdgpu_winsys_bo*)
> -        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
> -                                pb_cache_bucket);
> -   if (bo)
> -      return &bo->base;
> +      pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> +      assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +
> +      /* Get a buffer from the cache. */
> +      bo = (struct amdgpu_winsys_bo*)
> +           pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
> +                                   pb_cache_bucket);
> +      if (bo)
> +         return &bo->base;
> +   }
>
>     /* Create a new one. */
>     bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>                           pb_cache_bucket);
>     if (!bo) {
>        /* Clear the cache and try again. */
>        pb_slabs_reclaim(&ws->bo_slabs);
>        pb_cache_release_all_buffers(&ws->bo_cache);
>        bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>                              pb_cache_bucket);
>        if (!bo)
>           return NULL;
>     }
>
> -   bo->u.real.use_reusable_pool = true;
> +   bo->u.real.use_reusable_pool = use_reusable_pool;
>     return &bo->base;
>  }
>
>  static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
>                                                 struct winsys_handle *whandle,
>                                                 unsigned *stride,
>                                                 unsigned *offset)
>  {
>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>     struct amdgpu_winsys_bo *bo;
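Also worth noting: use_reusable_pool is stored on the BO, so the flag changes the destroy path as well as allocation. For readers without the tree at hand, this is roughly the existing amdgpu destroy logic it feeds into (paraphrased from amdgpu_bo_destroy_or_cache in this file; not new code in this patch):

   static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
   {
      struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

      if (bo->u.real.use_reusable_pool)
         /* Cached BO: park it in the pb_cache for later reuse. */
         pb_cache_add_buffer(&bo->u.real.cache_entry);
      else
         /* Shareable BO: return it to the kernel immediately. */
         amdgpu_bo_destroy(_buf);
   }

So with this patch, shareable BOs now bypass the cache on both allocation and destruction.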
> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> index 8027a5f..15e9d38 100644
> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> @@ -907,21 +907,21 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf,
>
>  static struct pb_buffer *
>  radeon_winsys_bo_create(struct radeon_winsys *rws,
>                          uint64_t size,
>                          unsigned alignment,
>                          enum radeon_bo_domain domain,
>                          enum radeon_bo_flag flags)
>  {
>      struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
>      struct radeon_bo *bo;
> -    unsigned usage = 0, pb_cache_bucket;
> +    unsigned usage = 0, pb_cache_bucket = 0;
>
>      assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
>
>      /* Only 32-bit sizes are supported. */
>      if (size > UINT_MAX)
>          return NULL;
>
>      /* VRAM implies WC. This is not optional. */
>      if (domain & RADEON_DOMAIN_VRAM)
>          flags |= RADEON_FLAG_GTT_WC;
> @@ -962,46 +962,51 @@ no_slab:
>      /* This flag is irrelevant for the cache. */
>      flags &= ~RADEON_FLAG_NO_SUBALLOC;
>
>      /* Align size to page size. This is the minimum alignment for normal
>       * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>       * like constant/uniform buffers, can benefit from better and more reuse.
>       */
>      size = align(size, ws->info.gart_page_size);
>      alignment = align(alignment, ws->info.gart_page_size);
>
> -    int heap = radeon_get_heap_index(domain, flags);
> -    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> -    usage = 1 << heap; /* Only set one usage bit for each heap. */
> +    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>
> -    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> -    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +    /* Shared resources don't use cached heaps. */
> +    if (use_reusable_pool) {
> +        int heap = radeon_get_heap_index(domain, flags);
> +        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> +        usage = 1 << heap; /* Only set one usage bit for each heap. */
>
> -    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
> -                                           usage, pb_cache_bucket));
> -    if (bo)
> -        return &bo->base;
> +        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> +        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +
> +        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
> +                                               usage, pb_cache_bucket));
> +        if (bo)
> +            return &bo->base;
> +    }
>
>      bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>                            pb_cache_bucket);
>      if (!bo) {
>          /* Clear the cache and try again. */
>          if (ws->info.has_virtual_memory)
>              pb_slabs_reclaim(&ws->bo_slabs);
>          pb_cache_release_all_buffers(&ws->bo_cache);
>          bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>                                pb_cache_bucket);
>          if (!bo)
>              return NULL;
>      }
>
> -    bo->u.real.use_reusable_pool = true;
> +    bo->u.real.use_reusable_pool = use_reusable_pool;
>
>      mtx_lock(&ws->bo_handles_mutex);
>      util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
>      mtx_unlock(&ws->bo_handles_mutex);
>
>      return &bo->base;
>  }
>
>  static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
>                                                     void *pointer, uint64_t size)
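One last thought: since the v2 note says current kernels reject the new flag, amdgpu_bo_alloc() will presumably fail on them. Would it make sense to mask the flag and retry until kernel support lands? Just a sketch of what I mean, not something the patch does:

   r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
   if (r && (request.flags & AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING)) {
      /* Hypothetical fallback for kernels without the flag:
       * retry the allocation without it instead of failing. */
      request.flags &= ~AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
      r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
   }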