[RFC] libdrm_intel: Rework BO allocs to avoid rounding up to bucket size

From: Garry Lancaster <garry.lancaster@xxxxxxxxx>

libdrm includes a scheme where freed buffer objects (BOs)
are held in a cache. This allows incoming allocation requests to be
serviced by re-using an old BO instead of allocating a new
object, which is a performance enhancement.
The cache is divided into "buckets". Each bucket holds unused
BOs of a pre-determined size. When a BO allocation request is seen,
the bucket for BOs of this size or larger is selected. Any BO
currently in the bucket will be re-used for the allocation. If the
bucket is empty, a new BO is created. However, the BO is created
with the size determined by the selected bucket (i.e. the size is
rounded up to the bucket size), rather than with the originally
requested size. This is so that when the BO is freed, it can be
released into the bucket and re-used by any other allocation which
selects the same bucket.
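
To make the rounding concrete, here is a minimal, stand-alone sketch of
the pre-existing behaviour. It is illustrative only and not the
libdrm_intel implementation: the bucket sizes and the
choose_bucket_size() helper are hypothetical.

#include <stdio.h>

/* Hypothetical cache bucket sizes in bytes; the real bucket layout in
 * intel_bufmgr_gem.c differs, but the rounding effect is the same.
 */
static const unsigned long bucket_sizes[] = {
	4096, 8192, 12288, 16384, 24576, 32768, 49152, 65536,
	98304, 131072, 163840, 196608, 262144
};

/* Old scheme: pick the first bucket at least as large as the request and
 * allocate the BO at the full bucket size, so that the freed BO can later
 * be re-used by any request mapping to the same bucket.
 */
static unsigned long choose_bucket_size(unsigned long request)
{
	size_t i;

	for (i = 0; i < sizeof(bucket_sizes) / sizeof(bucket_sizes[0]); i++)
		if (bucket_sizes[i] >= request)
			return bucket_sizes[i];

	return request;	/* no bucket this large: allocate the exact size */
}

int main(void)
{
	unsigned long request = 132 * 1024 + 512;	/* "just over 132K" */
	unsigned long allocated = choose_bucket_size(request);

	printf("requested %lu, allocated %lu, wasted %lu bytes\n",
	       request, allocated, allocated - request);
	return 0;
}

With the illustrative sizes above, a request just over 132K is rounded up
to the 160K bucket, wasting roughly 28K for that single allocation.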

Depending upon the size of the allocation, this rounding up can
result in significant memory wastage when allocating a BO. For
example, a BO request of just over 132K made during GLES context
creation was rounded up to the next bucket size of 160K. Such wastage
can be critical on devices with low memory.

This commit reworks the BO allocation code. On a BO allocation request,
if the selected bucket contains any BOs, they are checked in turn and
the first one large enough to fulfill the request is re-used. If none
is large enough, a new BO is created, but (thanks to the new size check)
it is no longer necessary to round its size up to the size determined
by the selected bucket.

Previously, therefore, all BOs in a bucket had the same size. Now the
BOs in a bucket can have different sizes, ranging from just above the
nominal size of the next smaller bucket up to the nominal size of the
current bucket.
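
The change amounts to a first-fit search within the selected bucket. The
actual patch below implements it with DRMLISTFOREACHSAFE /
DRMLISTFOREACHSAFEREVERSE over the bucket's list; the following
simplified, stand-alone sketch (hypothetical cached_bo type and
bucket_find_fit() helper, not the real data structures) shows the idea:

#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-in for a cached BO on a bucket's free list. */
struct cached_bo {
	unsigned long size;
	bool busy;
	struct cached_bo *next;
};

/* New scheme: BOs in a bucket are no longer all the same size, so walk
 * the list and take the first BO that is large enough (and, for BOs that
 * will be CPU-mapped straight away, not still busy on the GPU).  A NULL
 * return means the caller creates a new BO of the exact requested size
 * instead of the rounded-up bucket size.
 */
struct cached_bo *
bucket_find_fit(struct cached_bo *head, unsigned long size, bool need_idle)
{
	struct cached_bo *bo;

	for (bo = head; bo != NULL; bo = bo->next) {
		if (bo->size < size)
			continue;
		if (need_idle && bo->busy)
			continue;
		return bo;	/* caller unlinks it and re-uses it */
	}

	return NULL;
}

In the real code the SAFE iteration macros are used because the matching
entry is unlinked from the bucket while the list is being walked, and
render-target requests search from the tail (MRU) while other requests
search from the head (LRU), as the patch comments below describe.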

On a 1GB system, the following reductions in BO memory usage were seen:

BaseMark X 1.0:                324.4MB -> 306.0MB (-18.4MB;  5.7% saving)
BaseMark X 1.1 Medium Quality: 206.9MB -> 201.2MB (- 5.7MB;  2.8% saving)
GFXBench 3.0 TRex:             216.6MB -> 200.0MB (-16.6MB;  7.7% saving)
GFXBench 3.0 Manhattan:        281.4MB -> 246.8MB (-34.6MB; 12.3% saving)

No performance change was seen on BaseMark X. GFXBench 3.0 showed small
performance increases (~0.5fps on Manhattan, ~1-2fps on TRex), which may
be due to reduced activity of the OOM killer.

Change-Id: I83e07819944899df30613ca89acf907f8e9bfe13
Signed-off-by: Matthias Dejaegher <matthias.dejaegher@xxxxxxxxx>
Reviewed-by: Lancaster, Garry <garry.lancaster@xxxxxxxxx>
Signed-off-by: Arun Siluvery <arun.siluvery@xxxxxxxxxxxxxxx>
---
 intel/intel_bufmgr_gem.c | 134 +++++++++++++++++++++++++++++------------------
 1 file changed, 83 insertions(+), 51 deletions(-)

diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 0e1cb0d..d80ecb5 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -1,7 +1,7 @@
 /**************************************************************************
  *
  * Copyright © 2007 Red Hat Inc.
- * Copyright © 2007-2012 Intel Corporation
+ * Copyright © 2007-2014 Intel Corporation
  * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
  * All Rights Reserved.
  *
@@ -636,6 +636,31 @@ drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
 	}
 }
 
+static void
+drm_intel_gem_empty_bo_cache(drm_intel_bufmgr_gem *bufmgr_gem)
+{
+	pthread_mutex_lock(&bufmgr_gem->lock);
+
+	int i;
+
+	for (i = 0; i < bufmgr_gem->num_buckets; i++) {
+		struct drm_intel_gem_bo_bucket *bucket =
+		    &bufmgr_gem->cache_bucket[i];
+
+		while (!DRMLISTEMPTY(&bucket->head)) {
+			drm_intel_bo_gem *bo_gem;
+
+			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
+					      bucket->head.next, head);
+
+			DRMLISTDEL(&bo_gem->head);
+			drm_intel_gem_bo_free(&bo_gem->bo);
+		}
+	}
+
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+}
+
 static drm_intel_bo *
 drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 				const char *name,
@@ -649,54 +674,56 @@ drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 	unsigned int page_size = getpagesize();
 	int ret;
 	struct drm_intel_gem_bo_bucket *bucket;
+	struct _drmMMListHead *entry;
+	struct _drmMMListHead *temp;
 	bool alloc_from_cache;
-	unsigned long bo_size;
 	bool for_render = false;
 
 	if (flags & BO_ALLOC_FOR_RENDER)
 		for_render = true;
 
-	/* Round the allocated size up to a power of two number of pages. */
 	bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
 
-	/* If we don't have caching at this size, don't actually round the
-	 * allocation up.
-	 */
-	if (bucket == NULL) {
-		bo_size = size;
-		if (bo_size < page_size)
-			bo_size = page_size;
-	} else {
-		bo_size = bucket->size;
-	}
-
 	pthread_mutex_lock(&bufmgr_gem->lock);
 	/* Get a buffer out of the cache if available */
 retry:
 	alloc_from_cache = false;
 	if (bucket != NULL && !DRMLISTEMPTY(&bucket->head)) {
 		if (for_render) {
-			/* Allocate new render-target BOs from the tail (MRU)
-			 * of the list, as it will likely be hot in the GPU
+			/* Search from the tail (MRU) of the list to allocate
+			 * render-target BOs, as they will likely be hot in the GPU
 			 * cache and in the aperture for us.
 			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.prev, head);
-			DRMLISTDEL(&bo_gem->head);
-			alloc_from_cache = true;
+			DRMLISTFOREACHSAFEREVERSE(entry, temp, &bucket->head)
+			{
+			    bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
+					entry, head);
+
+			    if (bo_gem->bo.size >= size) {
+					DRMLISTDEL(&bo_gem->head);
+					alloc_from_cache = true;
+					break;
+			    }
+			}
 		} else {
 			/* For non-render-target BOs (where we're probably
 			 * going to map it first thing in order to fill it
-			 * with data), check if the last BO in the cache is
-			 * unbusy, and only reuse in that case. Otherwise,
-			 * allocating a new buffer is probably faster than
-			 * waiting for the GPU to finish.
+			 * with data), search from the head (LRU) of the list
+			 * and check if the BO is unbusy, and only reuse in that
+			 * case. Otherwise, allocating a new buffer is probably
+			 * faster than waiting for the GPU to finish.
 			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			if (!drm_intel_gem_bo_busy(&bo_gem->bo)) {
-				alloc_from_cache = true;
-				DRMLISTDEL(&bo_gem->head);
+			DRMLISTFOREACHSAFE(entry, temp, &bucket->head)
+			{
+			    bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
+					entry, head);
+
+			    if ((bo_gem->bo.size >= size) &&
+				!drm_intel_gem_bo_busy(&bo_gem->bo)) {
+					DRMLISTDEL(&bo_gem->head);
+					alloc_from_cache = true;
+					break;
+			    }
 			}
 		}
 
@@ -726,20 +753,38 @@ retry:
 		if (!bo_gem)
 			return NULL;
 
-		bo_gem->bo.size = bo_size;
+		bo_gem->bo.size = size;
 
 		VG_CLEAR(create);
-		create.size = bo_size;
+		create.size = size;
 
 		ret = drmIoctl(bufmgr_gem->fd,
 			       DRM_IOCTL_I915_GEM_CREATE,
 			       &create);
-		bo_gem->gem_handle = create.handle;
-		bo_gem->bo.handle = bo_gem->gem_handle;
+
 		if (ret != 0) {
-			free(bo_gem);
-			return NULL;
+			/* If allocation failed, clear the cache and retry.
+			 * Kernel has probably reclaimed any cached BOs already,
+			 * but may as well retry after emptying the buckets.
+			 */
+			drm_intel_gem_empty_bo_cache(bufmgr_gem);
+
+			VG_CLEAR(create);
+			create.size = size;
+
+			ret = drmIoctl(bufmgr_gem->fd,
+				       DRM_IOCTL_I915_GEM_CREATE,
+				       &create);
+
+			if (ret != 0) {
+				free(bo_gem);
+				bo_gem = NULL;
+				return NULL;
+			}
 		}
+
+		bo_gem->gem_handle = create.handle;
+		bo_gem->bo.handle = bo_gem->gem_handle;
 		bo_gem->bo.bufmgr = bufmgr;
 
 		bo_gem->tiling_mode = I915_TILING_NONE;
@@ -1123,6 +1168,7 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
 	DRMLISTDEL(&bo_gem->name_list);
 
 	bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, bo->size);
+
 	/* Put the buffer into our internal cache for reuse if we can. */
 	if (bufmgr_gem->bo_reuse && bo_gem->reusable && bucket != NULL &&
 	    drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
@@ -1617,29 +1663,15 @@ static void
 drm_intel_bufmgr_gem_destroy(drm_intel_bufmgr *bufmgr)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
-	int i;
 
 	free(bufmgr_gem->exec2_objects);
 	free(bufmgr_gem->exec_objects);
 	free(bufmgr_gem->exec_bos);
 	free(bufmgr_gem->aub_filename);
 
-	pthread_mutex_destroy(&bufmgr_gem->lock);
-
-	/* Free any cached buffer objects we were going to reuse */
-	for (i = 0; i < bufmgr_gem->num_buckets; i++) {
-		struct drm_intel_gem_bo_bucket *bucket =
-		    &bufmgr_gem->cache_bucket[i];
-		drm_intel_bo_gem *bo_gem;
+	drm_intel_gem_empty_bo_cache(bufmgr_gem);
 
-		while (!DRMLISTEMPTY(&bucket->head)) {
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			DRMLISTDEL(&bo_gem->head);
-
-			drm_intel_gem_bo_free(&bo_gem->bo);
-		}
-	}
+	pthread_mutex_destroy(&bufmgr_gem->lock);
 
 	free(bufmgr);
 }
-- 
2.0.4

