This improves the performance of Mesa's GL_MAP_UNSYNCHRONIZED_BIT path in GL_ARB_map_buffer_range. Improves Unigine Tropics performance at 1024x768 by 2.06236% +/- 0.50272% (n=11). --- intel/intel_bufmgr.h | 2 + intel/intel_bufmgr_gem.c | 72 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h index 85da8b9..e852eab 100644 --- a/intel/intel_bufmgr.h +++ b/intel/intel_bufmgr.h @@ -148,8 +148,10 @@ void drm_intel_bufmgr_gem_enable_reuse(drm_intel_bufmgr *bufmgr); void drm_intel_bufmgr_gem_enable_fenced_relocs(drm_intel_bufmgr *bufmgr); void drm_intel_bufmgr_gem_set_vma_cache_size(drm_intel_bufmgr *bufmgr, int limit); +int drm_intel_gem_bo_map_unsynchronized(drm_intel_bo *bo); int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo); int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo); + int drm_intel_gem_bo_get_reloc_count(drm_intel_bo *bo); void drm_intel_gem_bo_clear_relocs(drm_intel_bo *bo, int start); void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable); diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index 187e8ec..12641e1 100644 --- a/intel/intel_bufmgr_gem.c +++ b/intel/intel_bufmgr_gem.c @@ -1150,15 +1150,13 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable) return 0; } -int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo) +static int +map_gtt(drm_intel_bo *bo) { drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; - struct drm_i915_gem_set_domain set_domain; int ret; - pthread_mutex_lock(&bufmgr_gem->lock); - if (bo_gem->map_count++ == 0) drm_intel_gem_bo_open_vma(bufmgr_gem, bo_gem); @@ -1184,7 +1182,6 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo) strerror(errno)); if (--bo_gem->map_count == 0) drm_intel_gem_bo_close_vma(bufmgr_gem, bo_gem); - pthread_mutex_unlock(&bufmgr_gem->lock); return ret; } @@ -1201,7 +1198,6 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo) strerror(errno)); if (--bo_gem->map_count == 0) drm_intel_gem_bo_close_vma(bufmgr_gem, bo_gem); - pthread_mutex_unlock(&bufmgr_gem->lock); return ret; } } @@ -1211,7 +1207,33 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo) DBG("bo_map_gtt: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name, bo_gem->gtt_virtual); - /* Now move it to the GTT domain so that the CPU caches are flushed */ + return 0; +} + +int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + struct drm_i915_gem_set_domain set_domain; + int ret; + + pthread_mutex_lock(&bufmgr_gem->lock); + + ret = map_gtt(bo); + if (ret) { + pthread_mutex_unlock(&bufmgr_gem->lock); + return ret; + } + + /* Now move it to the GTT domain so that the GPU and CPU + * caches are flushed and the GPU isn't actively using the + * buffer. + * + * The pagefault handler does this domain change for us when + * it has unbound the BO from the GTT, but it's up to us to + * tell it when we're about to use things if we had done + * rendering and it still happens to be bound to the GTT. + */ set_domain.handle = bo_gem->gem_handle; set_domain.read_domains = I915_GEM_DOMAIN_GTT; set_domain.write_domain = I915_GEM_DOMAIN_GTT; @@ -1229,6 +1251,42 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo) return 0; } +/** + * Performs a mapping of the buffer object like the normal GTT + * mapping, but avoiding waiting for the GPU to be done reading from + * or rendering to the buffer. + * + * This is used in the implementation of GL_ARB_map_buffer_range: The + * user asks to create a buffer, then does a mapping, fills some + * space, runs a drawing command, then asks to map it again without + * synchronizing because it guarantees that it won't write over the + * data that the GPU is busy using (or, more specifically, that if it + * does write over the data, it acknowledges that rendering is + * undefined). + */ + +int drm_intel_gem_bo_map_unsynchronized(drm_intel_bo *bo) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + int ret; + + /* If the CPU cache isn't coherent with the GTT, then use a + * regular synchronized mapping. The problem is that we don't + * track where the buffer was last used on the CPU side in + * terms of drm_intel_bo_map vs drm_intel_gem_bo_map_gtt, so + * we would potentially corrupt the buffer even when the user + * does reasonable things. + */ + if (!bufmgr_gem->has_llc) + return drm_intel_gem_bo_map_gtt(bo); + + pthread_mutex_lock(&bufmgr_gem->lock); + ret = map_gtt(bo); + pthread_mutex_unlock(&bufmgr_gem->lock); + + return ret; +} + static int drm_intel_gem_bo_unmap(drm_intel_bo *bo) { drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; -- 1.7.9.1