Clients like i915 need to segregate cache domains within the GTT, which can lead to small amounts of fragmentation. By allocating the uncached buffers from the bottom and the cacheable buffers from the top, we can reduce the amount of wasted space and also optimize allocation of the mappable portion of the GTT to only those buffers that require CPU access through the GTT. For other drivers, allocating small buffer objects (BOs) from one end and large ones from the other helps reduce the impact of fragmentation. Only Radeon has behavioral changes in this patch, as I have no Intel hardware. Based on drm_mm work by Chris Wilson. -- Radeon uses a 512 KiB threshold: buffers larger than the threshold are allocated top-down, while smaller buffers and pinned (NO_EVICT) buffers are allocated bottom-up. This decreases eviction by up to 20% by improving the fragmentation quality. There is no harm in normal cases that fit fully in VRAM (PTS gaming suite). In some cases, even the VRAM-fitting cases improved slightly (OpenArena, Urban Terror). 512 KiB was measured to be the best threshold for 3D workloads common to Radeon. Other drivers may need different thresholds according to their workloads. 
v2: Updated kerneldoc in more places Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Ben Widawsky <ben@xxxxxxxxxxxx> Signed-off-by: Lauri Kasanen <cand@xxxxxxx> --- drivers/gpu/drm/ast/ast_ttm.c | 3 +- drivers/gpu/drm/bochs/bochs_mm.c | 3 +- drivers/gpu/drm/cirrus/cirrus_ttm.c | 3 +- drivers/gpu/drm/drm_mm.c | 66 ++++++++++++++++++++++++++--------- drivers/gpu/drm/i915/i915_gem.c | 3 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 3 +- drivers/gpu/drm/mgag200/mgag200_ttm.c | 3 +- drivers/gpu/drm/nouveau/nouveau_ttm.c | 2 +- drivers/gpu/drm/qxl/qxl_ttm.c | 2 +- drivers/gpu/drm/radeon/radeon_ttm.c | 3 +- drivers/gpu/drm/ttm/ttm_bo.c | 4 ++- drivers/gpu/drm/ttm/ttm_bo_manager.c | 16 +++++++-- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +- include/drm/drm_mm.h | 32 ++++++++++++++--- include/drm/ttm/ttm_bo_driver.h | 9 ++++- 15 files changed, 118 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/ast/ast_ttm.c b/drivers/gpu/drm/ast/ast_ttm.c index b824622..61f9e39 100644 --- a/drivers/gpu/drm/ast/ast_ttm.c +++ b/drivers/gpu/drm/ast/ast_ttm.c @@ -262,7 +262,8 @@ int ast_mm_init(struct ast_private *ast) &ast_bo_driver, dev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - true); + true, + 0); if (ret) { DRM_ERROR("Error initialising bo driver; %d\n", ret); return ret; diff --git a/drivers/gpu/drm/bochs/bochs_mm.c b/drivers/gpu/drm/bochs/bochs_mm.c index f488be5..9dfd24a 100644 --- a/drivers/gpu/drm/bochs/bochs_mm.c +++ b/drivers/gpu/drm/bochs/bochs_mm.c @@ -228,7 +228,8 @@ int bochs_mm_init(struct bochs_device *bochs) &bochs_bo_driver, bochs->dev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - true); + true, + 0); if (ret) { DRM_ERROR("Error initialising bo driver; %d\n", ret); return ret; diff --git a/drivers/gpu/drm/cirrus/cirrus_ttm.c b/drivers/gpu/drm/cirrus/cirrus_ttm.c index 92e6b77..74e8e21 100644 --- a/drivers/gpu/drm/cirrus/cirrus_ttm.c +++ b/drivers/gpu/drm/cirrus/cirrus_ttm.c @@ -262,7 +262,8 @@ int cirrus_mm_init(struct cirrus_device *cirrus) &cirrus_bo_driver, 
dev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - true); + true, + 0); if (ret) { DRM_ERROR("Error initialising bo driver; %d\n", ret); return ret; diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index a2d45b74..8f64be4 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -82,6 +82,10 @@ * this to implement guard pages between incompatible caching domains in the * graphics TT. * + * Two behaviors are supported for searching and allocating: bottom-up and top-down. + * The default is bottom-up. Top-down allocation can be used if the memory area + * has different restrictions, or just to reduce fragmentation. + * * Finally iteration helpers to walk all nodes and all holes are provided as are * some basic allocator dumpers for debugging. */ @@ -102,7 +106,8 @@ static struct drm_mm_node *drm_mm_search_free_in_range_generic(const struct drm_ static void drm_mm_insert_helper(struct drm_mm_node *hole_node, struct drm_mm_node *node, unsigned long size, unsigned alignment, - unsigned long color) + unsigned long color, + enum drm_mm_allocator_flags flags) { struct drm_mm *mm = hole_node->mm; unsigned long hole_start = drm_mm_hole_node_start(hole_node); @@ -115,12 +120,22 @@ static void drm_mm_insert_helper(struct drm_mm_node *hole_node, if (mm->color_adjust) mm->color_adjust(hole_node, color, &adj_start, &adj_end); + if (flags & DRM_MM_CREATE_TOP) + adj_start = adj_end - size; + if (alignment) { unsigned tmp = adj_start % alignment; - if (tmp) - adj_start += alignment - tmp; + if (tmp) { + if (flags & DRM_MM_CREATE_TOP) + adj_start -= tmp; + else + adj_start += alignment - tmp; + } } + BUG_ON(adj_start < hole_start); + BUG_ON(adj_end > hole_end); + if (adj_start == hole_start) { hole_node->hole_follows = 0; list_del(&hole_node->hole_stack); @@ -205,7 +220,8 @@ EXPORT_SYMBOL(drm_mm_reserve_node); * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for this node - * @flags: flags to 
fine-tune the allocation + * @sflags: flags to fine-tune the allocation search + * @aflags: flags to fine-tune the allocation behavior * * The preallocated node must be cleared to 0. * @@ -215,16 +231,17 @@ EXPORT_SYMBOL(drm_mm_reserve_node); int drm_mm_insert_node_generic(struct drm_mm *mm, struct drm_mm_node *node, unsigned long size, unsigned alignment, unsigned long color, - enum drm_mm_search_flags flags) + enum drm_mm_search_flags sflags, + enum drm_mm_allocator_flags aflags) { struct drm_mm_node *hole_node; hole_node = drm_mm_search_free_generic(mm, size, alignment, - color, flags); + color, sflags); if (!hole_node) return -ENOSPC; - drm_mm_insert_helper(hole_node, node, size, alignment, color); + drm_mm_insert_helper(hole_node, node, size, alignment, color, aflags); return 0; } EXPORT_SYMBOL(drm_mm_insert_node_generic); @@ -233,7 +250,8 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, struct drm_mm_node *node, unsigned long size, unsigned alignment, unsigned long color, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + enum drm_mm_allocator_flags flags) { struct drm_mm *mm = hole_node->mm; unsigned long hole_start = drm_mm_hole_node_start(hole_node); @@ -248,13 +266,20 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, if (adj_end > end) adj_end = end; + if (flags & DRM_MM_CREATE_TOP) + adj_start = adj_end - size; + if (mm->color_adjust) mm->color_adjust(hole_node, color, &adj_start, &adj_end); if (alignment) { unsigned tmp = adj_start % alignment; - if (tmp) - adj_start += alignment - tmp; + if (tmp) { + if (flags & DRM_MM_CREATE_TOP) + adj_start -= tmp; + else + adj_start += alignment - tmp; + } } if (adj_start == hole_start) { @@ -271,6 +296,8 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, INIT_LIST_HEAD(&node->hole_stack); list_add(&node->node_list, &hole_node->node_list); + BUG_ON(node->start < start); + BUG_ON(node->start < adj_start); 
BUG_ON(node->start + node->size > adj_end); BUG_ON(node->start + node->size > end); @@ -290,7 +317,8 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, * @color: opaque tag value to use for this node * @start: start of the allowed range for this node * @end: end of the allowed range for this node - * @flags: flags to fine-tune the allocation + * @sflags: flags to fine-tune the allocation search + * @aflags: flags to fine-tune the allocation behavior * * The preallocated node must be cleared to 0. * @@ -298,21 +326,23 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, * 0 on success, -ENOSPC if there's no suitable hole. */ int drm_mm_insert_node_in_range_generic(struct drm_mm *mm, struct drm_mm_node *node, - unsigned long size, unsigned alignment, unsigned long color, + unsigned long size, unsigned alignment, + unsigned long color, unsigned long start, unsigned long end, - enum drm_mm_search_flags flags) + enum drm_mm_search_flags sflags, + enum drm_mm_allocator_flags aflags) { struct drm_mm_node *hole_node; hole_node = drm_mm_search_free_in_range_generic(mm, size, alignment, color, - start, end, flags); + start, end, sflags); if (!hole_node) return -ENOSPC; drm_mm_insert_helper_range(hole_node, node, size, alignment, color, - start, end); + start, end, aflags); return 0; } EXPORT_SYMBOL(drm_mm_insert_node_in_range_generic); @@ -391,7 +421,8 @@ static struct drm_mm_node *drm_mm_search_free_generic(const struct drm_mm *mm, best = NULL; best_size = ~0UL; - drm_mm_for_each_hole(entry, mm, adj_start, adj_end) { + __drm_mm_for_each_hole(entry, mm, adj_start, adj_end, + flags & DRM_MM_SEARCH_BELOW) { if (mm->color_adjust) { mm->color_adjust(entry, color, &adj_start, &adj_end); if (adj_end <= adj_start) @@ -432,7 +463,8 @@ static struct drm_mm_node *drm_mm_search_free_in_range_generic(const struct drm_ best = NULL; best_size = ~0UL; - drm_mm_for_each_hole(entry, mm, adj_start, adj_end) { + __drm_mm_for_each_hole(entry, mm, 
adj_start, adj_end, + flags & DRM_MM_SEARCH_BELOW) { if (adj_start < start) adj_start = start; if (adj_end > end) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 9c52f68..489bc33 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3270,7 +3270,8 @@ search_free: ret = drm_mm_insert_node_in_range_generic(&vm->mm, &vma->node, size, alignment, obj->cache_level, 0, gtt_max, - DRM_MM_SEARCH_DEFAULT); + DRM_MM_SEARCH_DEFAULT, + DRM_MM_CREATE_DEFAULT); if (ret) { ret = i915_gem_evict_something(dev, vm, size, alignment, obj->cache_level, flags); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 63a6dc7..dd72d35 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1072,7 +1072,8 @@ alloc: &ppgtt->node, GEN6_PD_SIZE, GEN6_PD_ALIGN, 0, 0, dev_priv->gtt.base.total, - DRM_MM_SEARCH_DEFAULT); + DRM_MM_SEARCH_DEFAULT, + DRM_MM_CREATE_DEFAULT); if (ret == -ENOSPC && !retried) { ret = i915_gem_evict_something(dev, &dev_priv->gtt.base, GEN6_PD_SIZE, GEN6_PD_ALIGN, diff --git a/drivers/gpu/drm/mgag200/mgag200_ttm.c b/drivers/gpu/drm/mgag200/mgag200_ttm.c index 5a00e90..6844b24 100644 --- a/drivers/gpu/drm/mgag200/mgag200_ttm.c +++ b/drivers/gpu/drm/mgag200/mgag200_ttm.c @@ -262,7 +262,8 @@ int mgag200_mm_init(struct mga_device *mdev) &mgag200_bo_driver, dev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - true); + true, + 0); if (ret) { DRM_ERROR("Error initialising bo driver; %d\n", ret); return ret; diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index ab0228f..3fef97c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -384,7 +384,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) &nouveau_bo_driver, dev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - bits <= 32 ? true : false); + bits <= 32 ? 
true : false, 0); if (ret) { NV_ERROR(drm, "error initialising bo driver, %d\n", ret); return ret; diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 29c02e0..8401a00 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -495,7 +495,7 @@ int qxl_ttm_init(struct qxl_device *qdev) qdev->mman.bo_global_ref.ref.object, &qxl_bo_driver, qdev->ddev->anon_inode->i_mapping, - DRM_FILE_PAGE_OFFSET, 0); + DRM_FILE_PAGE_OFFSET, 0, 0); if (r) { DRM_ERROR("failed initializing buffer object driver(%d).\n", r); return r; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index c8a8a51..1aef339 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -710,7 +710,8 @@ int radeon_ttm_init(struct radeon_device *rdev) &radeon_bo_driver, rdev->ddev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - rdev->need_dma32); + rdev->need_dma32, + 512 * 1024); if (r) { DRM_ERROR("failed initializing buffer object driver(%d).\n", r); return r; diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 9df79ac..caf7cd3 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1453,7 +1453,8 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev, struct ttm_bo_driver *driver, struct address_space *mapping, uint64_t file_page_offset, - bool need_dma32) + bool need_dma32, + uint32_t alloc_threshold) { int ret = -EINVAL; @@ -1476,6 +1477,7 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev, bdev->dev_mapping = mapping; bdev->glob = glob; bdev->need_dma32 = need_dma32; + bdev->alloc_threshold = alloc_threshold; bdev->val_seq = 0; spin_lock_init(&bdev->fence_lock); mutex_lock(&glob->device_list_mutex); diff --git a/drivers/gpu/drm/ttm/ttm_bo_manager.c b/drivers/gpu/drm/ttm/ttm_bo_manager.c index c58eba33..db9fcb4 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_manager.c +++ b/drivers/gpu/drm/ttm/ttm_bo_manager.c @@ -55,6 +55,7 @@ static int 
ttm_bo_man_get_node(struct ttm_mem_type_manager *man, struct ttm_range_manager *rman = (struct ttm_range_manager *) man->priv; struct drm_mm *mm = &rman->mm; struct drm_mm_node *node = NULL; + enum drm_mm_allocator_flags aflags = DRM_MM_CREATE_DEFAULT; unsigned long lpfn; int ret; @@ -65,12 +66,21 @@ static int ttm_bo_man_get_node(struct ttm_mem_type_manager *man, node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; + /** + * If the driver requested a threshold, use two-ended allocation. + * Pinned buffers require bottom-up allocation. + */ + if (man->bdev->alloc_threshold && + !(bo->mem.placement & TTM_PL_FLAG_NO_EVICT) && + man->bdev->alloc_threshold < (mem->num_pages * PAGE_SIZE)) + aflags = DRM_MM_CREATE_TOP; spin_lock(&rman->lock); - ret = drm_mm_insert_node_in_range(mm, node, mem->num_pages, - mem->page_alignment, + ret = drm_mm_insert_node_in_range_generic(mm, node, mem->num_pages, + mem->page_alignment, 0, placement->fpfn, lpfn, - DRM_MM_SEARCH_BEST); + DRM_MM_SEARCH_BEST, + aflags); spin_unlock(&rman->lock); if (unlikely(ret)) { diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index c700958..59a50dd 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -725,7 +725,7 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) &vmw_bo_driver, dev->anon_inode->i_mapping, VMWGFX_FILE_PAGE_OFFSET, - false); + false, 0); if (unlikely(ret != 0)) { DRM_ERROR("Failed initializing TTM buffer object driver.\n"); goto out_err1; diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index 8b6981a..a24addf 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -47,8 +47,17 @@ enum drm_mm_search_flags { DRM_MM_SEARCH_DEFAULT = 0, DRM_MM_SEARCH_BEST = 1 << 0, + DRM_MM_SEARCH_BELOW = 1 << 1, }; +enum drm_mm_allocator_flags { + DRM_MM_CREATE_DEFAULT = 0, + DRM_MM_CREATE_TOP = 1 << 0, +}; + +#define DRM_MM_BOTTOMUP DRM_MM_SEARCH_DEFAULT, 
DRM_MM_CREATE_DEFAULT +#define DRM_MM_TOPDOWN DRM_MM_SEARCH_BELOW, DRM_MM_CREATE_TOP + struct drm_mm_node { struct list_head node_list; struct list_head hole_stack; @@ -186,6 +195,9 @@ static inline unsigned long drm_mm_hole_node_end(struct drm_mm_node *hole_node) * Implementation Note: * We need to inline list_for_each_entry in order to be able to set hole_start * and hole_end on each iteration while keeping the macro sane. + * + * The __drm_mm_for_each_hole version is similar, but with added support for + * going backwards. */ #define drm_mm_for_each_hole(entry, mm, hole_start, hole_end) \ for (entry = list_entry((mm)->hole_stack.next, struct drm_mm_node, hole_stack); \ @@ -195,6 +207,14 @@ static inline unsigned long drm_mm_hole_node_end(struct drm_mm_node *hole_node) 1 : 0; \ entry = list_entry(entry->hole_stack.next, struct drm_mm_node, hole_stack)) +#define __drm_mm_for_each_hole(entry, mm, hole_start, hole_end, backwards) \ + for (entry = list_entry((backwards) ? (mm)->hole_stack.prev : (mm)->hole_stack.next, struct drm_mm_node, hole_stack); \ + &entry->hole_stack != &(mm)->hole_stack ? \ + hole_start = drm_mm_hole_node_start(entry), \ + hole_end = drm_mm_hole_node_end(entry), \ + 1 : 0; \ + entry = list_entry((backwards) ? 
entry->hole_stack.prev : entry->hole_stack.next, struct drm_mm_node, hole_stack)) + /* * Basic range manager support (drm_mm.c) */ @@ -205,7 +225,8 @@ int drm_mm_insert_node_generic(struct drm_mm *mm, unsigned long size, unsigned alignment, unsigned long color, - enum drm_mm_search_flags flags); + enum drm_mm_search_flags sflags, + enum drm_mm_allocator_flags aflags); /** * drm_mm_insert_node - search for space and insert @node * @mm: drm_mm to allocate from @@ -228,7 +249,8 @@ static inline int drm_mm_insert_node(struct drm_mm *mm, unsigned alignment, enum drm_mm_search_flags flags) { - return drm_mm_insert_node_generic(mm, node, size, alignment, 0, flags); + return drm_mm_insert_node_generic(mm, node, size, alignment, 0, flags, + DRM_MM_CREATE_DEFAULT); } int drm_mm_insert_node_in_range_generic(struct drm_mm *mm, @@ -238,7 +260,8 @@ int drm_mm_insert_node_in_range_generic(struct drm_mm *mm, unsigned long color, unsigned long start, unsigned long end, - enum drm_mm_search_flags flags); + enum drm_mm_search_flags sflags, + enum drm_mm_allocator_flags aflags); /** * drm_mm_insert_node_in_range - ranged search for space and insert @node * @mm: drm_mm to allocate from @@ -266,7 +289,8 @@ static inline int drm_mm_insert_node_in_range(struct drm_mm *mm, enum drm_mm_search_flags flags) { return drm_mm_insert_node_in_range_generic(mm, node, size, alignment, - 0, start, end, flags); + 0, start, end, flags, + DRM_MM_CREATE_DEFAULT); } void drm_mm_remove_node(struct drm_mm_node *node); diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 5d8aabe..4924c487 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -527,6 +527,8 @@ struct ttm_bo_global { * @dev_mapping: A pointer to the struct address_space representing the * device address space. * @wq: Work queue structure for the delayed delete workqueue. + * @alloc_threshold: If non-zero, use this as the threshold for two-ended + * allocation. 
* */ @@ -565,6 +567,7 @@ struct ttm_bo_device { struct delayed_work wq; bool need_dma32; + uint32_t alloc_threshold; }; /** @@ -751,6 +754,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev); * @file_page_offset: Offset into the device address space that is available * for buffer data. This ensures compatibility with other users of the * address space. + * @alloc_threshold: If non-zero, use this as the threshold for two-ended + * allocation. * * Initializes a struct ttm_bo_device: * Returns: @@ -760,7 +765,9 @@ extern int ttm_bo_device_init(struct ttm_bo_device *bdev, struct ttm_bo_global *glob, struct ttm_bo_driver *driver, struct address_space *mapping, - uint64_t file_page_offset, bool need_dma32); + uint64_t file_page_offset, + bool need_dma32, + uint32_t alloc_threshold); /** * ttm_bo_unmap_virtual -- 1.8.3.1 _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/dri-devel