On Wed, 2023-02-15 at 18:42 +0100, Christian König wrote:
> On 15.02.23 17:13, Thomas Hellström wrote:
> > When swapping out, we will split multi-order pages both in order to
> > move them to the swap-cache and to be able to return memory to the
> > swap cache as soon as possible on a page-by-page basis. By reducing
> > the page max order to the system PMD size, we can be nicer to the
> > system and avoid splitting gigantic pages.
> >
> > On top of this we also include the 64K page size in the page sizes
> > tried, since that appears to be a common size for GPU applications.
>
> Please completely drop that.

You mean the 64K page size, or the whole patch?

> This is just nonsense spilling in from the Windows drivers.

Agreed, but IIRC on the last RFC you asked me not to drop the 64K
pages, so that's why they are here. I can remove them if needed. The
only reason to keep them, from a performance point of view, is better
efficiency on GPUs with a 64K page size when not using a coalescing
IOMMU for dma-mapping.

Let me know what you think is best and I'll adjust accordingly. (A
small worked example of the resulting order selection is appended
below the quoted patch.)

/Thomas

> Christian.
>
> >
> > Looking forward to when we might be able to swap out PMD size
> > folios without splitting, this will also be a benefit.
> >
> > Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
> > ---
> >  drivers/gpu/drm/ttm/ttm_pool.c | 58 ++++++++++++++++++++++++++--------
> >  1 file changed, 45 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> > index 1cc7591a9542..8787fb6a218b 100644
> > --- a/drivers/gpu/drm/ttm/ttm_pool.c
> > +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> > @@ -31,6 +31,8 @@
> >   * cause they are rather slow compared to alloc_pages+map.
> >   */
> >
> > +#define pr_fmt(fmt) "[TTM POOL] " fmt
> > +
> >  #include <linux/module.h>
> >  #include <linux/dma-mapping.h>
> >  #include <linux/debugfs.h>
> > @@ -47,6 +49,18 @@
> >
> >  #include "ttm_module.h"
> >
> > +#define TTM_MAX_ORDER (PMD_SHIFT - PAGE_SHIFT)
> > +#define TTM_64K_ORDER (16 - PAGE_SHIFT)
> > +#if (TTM_MAX_ORDER < TTM_64K_ORDER)
> > +#undef TTM_MAX_ORDER
> > +#define TTM_MAX_ORDER TTM_64K_ORDER
> > +#endif
> > +#if ((MAX_ORDER - 1) < TTM_MAX_ORDER)
> > +#undef TTM_MAX_ORDER
> > +#define TTM_MAX_ORDER (MAX_ORDER - 1)
> > +#endif
> > +#define TTM_DIM_ORDER (TTM_MAX_ORDER + 1)
> > +
> >  /**
> >   * struct ttm_pool_dma - Helper object for coherent DMA mappings
> >   *
> > @@ -65,16 +79,18 @@ module_param(page_pool_size, ulong, 0644);
> >
> >  static atomic_long_t allocated_pages;
> >
> > -static struct ttm_pool_type global_write_combined[MAX_ORDER];
> > -static struct ttm_pool_type global_uncached[MAX_ORDER];
> > +static struct ttm_pool_type global_write_combined[TTM_DIM_ORDER];
> > +static struct ttm_pool_type global_uncached[TTM_DIM_ORDER];
> >
> > -static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER];
> > -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
> > +static struct ttm_pool_type global_dma32_write_combined[TTM_DIM_ORDER];
> > +static struct ttm_pool_type global_dma32_uncached[TTM_DIM_ORDER];
> >
> >  static spinlock_t shrinker_lock;
> >  static struct list_head shrinker_list;
> >  static struct shrinker mm_shrinker;
> >
> > +static unsigned int ttm_pool_orders[] = {TTM_MAX_ORDER, 0, 0};
> > +
> >  /* Allocate pages of size 1 << order with the given gfp_flags */
> >  static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> >  					unsigned int order)
> > @@ -400,6 +416,17 @@ static void __ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt,
> >  	}
> >  }
> >
> > +static unsigned int ttm_pool_select_order(unsigned int order, pgoff_t num_pages)
> > +{
> > +	unsigned int *cur_order = ttm_pool_orders;
> > +
> > +	order = min_t(unsigned int, __fls(num_pages), order);
> > +	while (order < *cur_order)
> > +		++cur_order;
> > +
> > +	return *cur_order;
> > +}
> > +
> >  /**
> >   * ttm_pool_alloc - Fill a ttm_tt object
> >   *
> > @@ -439,9 +466,8 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
> >  	else
> >  		gfp_flags |= GFP_HIGHUSER;
> >
> > -	for (order = min_t(unsigned int, MAX_ORDER - 1, __fls(num_pages));
> > -	     num_pages;
> > -	     order = min_t(unsigned int, order, __fls(num_pages))) {
> > +	order = ttm_pool_select_order(ttm_pool_orders[0], num_pages);
> > +	for (; num_pages; order = ttm_pool_select_order(order, num_pages)) {
> >  		struct ttm_pool_type *pt;
> >
> >  		page_caching = tt->caching;
> > @@ -558,7 +584,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
> >
> >  	if (use_dma_alloc) {
> >  		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
> > -			for (j = 0; j < MAX_ORDER; ++j)
> > +			for (j = 0; j < TTM_DIM_ORDER; ++j)
> >  				ttm_pool_type_init(&pool->caching[i].orders[j],
> >  						   pool, i, j);
> >  	}
> > @@ -578,7 +604,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
> >
> >  	if (pool->use_dma_alloc) {
> >  		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
> > -			for (j = 0; j < MAX_ORDER; ++j)
> > +			for (j = 0; j < TTM_DIM_ORDER; ++j)
> >  				ttm_pool_type_fini(&pool->caching[i].orders[j]);
> >  	}
> >
> > @@ -632,7 +658,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m)
> >  	unsigned int i;
> >
> >  	seq_puts(m, "\t ");
> > -	for (i = 0; i < MAX_ORDER; ++i)
> > +	for (i = 0; i < TTM_DIM_ORDER; ++i)
> >  		seq_printf(m, " ---%2u---", i);
> >  	seq_puts(m, "\n");
> >  }
> > @@ -643,7 +669,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
> >  {
> >  	unsigned int i;
> >
> > -	for (i = 0; i < MAX_ORDER; ++i)
> > +	for (i = 0; i < TTM_DIM_ORDER; ++i)
> >  		seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
> >  	seq_puts(m, "\n");
> >  }
> > @@ -749,10 +775,16 @@ int ttm_pool_mgr_init(unsigned long num_pages)
> >  	if (!page_pool_size)
> >  		page_pool_size = num_pages;
> >
> > +	if (TTM_64K_ORDER < TTM_MAX_ORDER)
> > +		ttm_pool_orders[1] = TTM_64K_ORDER;
> > +
> > +	pr_debug("Used orders are %u %u %u\n", ttm_pool_orders[0],
> > +		 ttm_pool_orders[1], ttm_pool_orders[2]);
> > +
> >  	spin_lock_init(&shrinker_lock);
> >  	INIT_LIST_HEAD(&shrinker_list);
> >
> > -	for (i = 0; i < MAX_ORDER; ++i) {
> > +	for (i = 0; i < TTM_DIM_ORDER; ++i) {
> >  		ttm_pool_type_init(&global_write_combined[i], NULL,
> >  				   ttm_write_combined, i);
> >  		ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i);
> > @@ -785,7 +817,7 @@ void ttm_pool_mgr_fini(void)
> >  {
> >  	unsigned int i;
> >
> > -	for (i = 0; i < MAX_ORDER; ++i) {
> > +	for (i = 0; i < TTM_DIM_ORDER; ++i) {
> >  		ttm_pool_type_fini(&global_write_combined[i]);
> >  		ttm_pool_type_fini(&global_uncached[i]);
> >
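For reference, the promised sketch: a small, self-contained user-space
program showing how the order selection above walks the order list. It
assumes 4 KiB base pages and a 2 MiB PMD (PAGE_SHIFT == 12, PMD_SHIFT == 21),
so the tried orders resolve to 9, 4 and 0. fls_order() and select_order()
below are portable stand-ins for the kernel's __fls() and
ttm_pool_select_order(); this is an illustration, not the kernel code.

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 KiB base pages */
#define PMD_SHIFT  21				/* assumed: 2 MiB PMD entries */

#define TTM_MAX_ORDER (PMD_SHIFT - PAGE_SHIFT)	/* 9 -> 2 MiB chunks */
#define TTM_64K_ORDER (16 - PAGE_SHIFT)		/* 4 -> 64 KiB chunks */

/*
 * Orders tried, highest first, terminated by order 0 (single pages).
 * In the patch, slot [1] is only filled in at ttm_pool_mgr_init() when
 * the 64K order is smaller than the max order; here it is filled
 * statically for simplicity.
 */
static unsigned int ttm_pool_orders[] = { TTM_MAX_ORDER, TTM_64K_ORDER, 0 };

/* Highest set bit, standing in for the kernel's __fls(). */
static unsigned int fls_order(unsigned long num_pages)
{
	unsigned int order = 0;

	while (num_pages >>= 1)
		++order;
	return order;
}

/* Same walk as ttm_pool_select_order() in the patch. */
static unsigned int select_order(unsigned int order, unsigned long num_pages)
{
	unsigned int *cur_order = ttm_pool_orders;

	if (fls_order(num_pages) < order)
		order = fls_order(num_pages);
	while (order < *cur_order)
		++cur_order;
	return *cur_order;
}

int main(void)
{
	unsigned long num_pages = 1000;	/* ~3.9 MiB buffer with 4 KiB pages */
	unsigned int order;

	/* Mirrors the rewritten allocation loop in ttm_pool_alloc(). */
	for (order = select_order(ttm_pool_orders[0], num_pages);
	     num_pages;
	     order = select_order(order, num_pages)) {
		printf("alloc order %u (%lu pages)\n", order, 1ul << order);
		num_pages -= 1ul << order;
	}
	return 0;
}

For 1000 pages this prints one order-9 allocation, thirty order-4
allocations and eight order-0 allocations, i.e. the pool falls back from
2 MiB to 64 KiB to single pages as the remaining size shrinks, which is the
granularity trade-off discussed above.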