The patch titled Subject: mm/filemap: optimize filemap folio adding has been added to the -mm mm-unstable branch. Its filename is mm-filemap-optimize-filemap-folio-adding.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-filemap-optimize-filemap-folio-adding.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Kairui Song <kasong@xxxxxxxxxxx> Subject: mm/filemap: optimize filemap folio adding Date: Tue, 16 Apr 2024 01:18:56 +0800 Instead of doing multiple tree walks, do one optimism range check with lock hold, and exit if raced with another insertion. If a shadow exists, check it with a new xas_get_order helper before releasing the lock to avoid redundant tree walks for getting its order. Drop the lock and do the allocation only if a split is needed. In the best case, it only need to walk the tree once. If it needs to alloc and split, 3 walks are issued (One for first ranged conflict check and order retrieving, one for the second check after allocation, one for the insert after split). Testing with 4K pages, in an 8G cgroup, with 16G brd as block device: echo 3 > /proc/sys/vm/drop_caches fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap --rw=randread --time_based \ --ramp_time=30s --runtime=5m --group_reporting Before: bw ( MiB/s): min= 1027, max= 3520, per=100.00%, avg=2445.02, stdev=18.90, samples=8691 iops : min=263001, max=901288, avg=625924.36, stdev=4837.28, samples=8691 After (+7.3%): bw ( MiB/s): min= 493, max= 3947, per=100.00%, avg=2625.56, stdev=25.74, samples=8651 iops : min=126454, max=1010681, avg=672142.61, stdev=6590.48, samples=8651 Test result with THP (do a THP randread then switch to 4K page in hope it issues a lot of splitting): echo 3 > /proc/sys/vm/drop_caches fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap -thp=1 --readonly \ --rw=randread --time_based --ramp_time=30s --runtime=10m \ --group_reporting fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap \ --rw=randread --time_based --runtime=5s --group_reporting Before: bw ( KiB/s): min= 4141, max=14202, per=100.00%, avg=7935.51, stdev=96.85, samples=18976 iops : min= 1029, max= 3548, avg=1979.52, stdev=24.23, samples=18976· READ: bw=4545B/s (4545B/s), 4545B/s-4545B/s (4545B/s-4545B/s), io=64.0KiB (65.5kB), run=14419-14419msec After (+12.5%): bw ( KiB/s): min= 4611, max=15370, per=100.00%, avg=8928.74, stdev=105.17, samples=19146 iops : min= 1151, max= 3842, avg=2231.27, stdev=26.29, samples=19146 READ: bw=4635B/s (4635B/s), 4635B/s-4635B/s (4635B/s-4635B/s), io=64.0KiB (65.5kB), run=14137-14137msec The performance is better for both 4K (+7.5%) and THP (+12.5%) cached read. Link: https://lkml.kernel.org/r/20240415171857.19244-5-ryncsn@xxxxxxxxx Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx> Cc: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- lib/test_xarray.c | 59 ++++++++++++++++++++++++++++++++++++++++++++ mm/filemap.c | 56 ++++++++++++++++++++++++++++++----------- 2 files changed, 100 insertions(+), 15 deletions(-) --- a/lib/test_xarray.c~mm-filemap-optimize-filemap-folio-adding +++ a/lib/test_xarray.c @@ -2017,6 +2017,64 @@ static noinline void check_xas_get_order } } +static noinline void check_xas_conflict_get_order(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + + void *entry; + int only_once; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned int order; + unsigned long i, j, k; + + for (order = 0; order < max_order; order++) { + for (i = 0; i < 10; i++) { + xas_set_order(&xas, i << order, order); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(i)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + /* + * Ensure xas_get_order works with xas_for_each_conflict. + */ + j = i << order; + for (k = 0; k < order; k++) { + only_once = 0; + xas_set_order(&xas, j + (1 << k), k); + xas_lock(&xas); + xas_for_each_conflict(&xas, entry) { + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, xas_get_order(&xas) != order); + only_once++; + } + XA_BUG_ON(xa, only_once != 1); + xas_unlock(&xas); + } + + if (order < max_order - 1) { + only_once = 0; + xas_set_order(&xas, (i & ~1UL) << order, order + 1); + xas_lock(&xas); + xas_for_each_conflict(&xas, entry) { + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, xas_get_order(&xas) != order); + only_once++; + } + XA_BUG_ON(xa, only_once != 1); + xas_unlock(&xas); + } + + xas_set_order(&xas, i << order, order); + xas_lock(&xas); + xas_store(&xas, NULL); + xas_unlock(&xas); + } + } +} + + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -2069,6 +2127,7 @@ static int xarray_checks(void) check_multi_store_advanced(&array); check_get_order(&array); check_xas_get_order(&array); + check_xas_conflict_get_order(&array); check_xa_alloc(); check_find(&array); check_find_entry(&array); --- a/mm/filemap.c~mm-filemap-optimize-filemap-folio-adding +++ a/mm/filemap.c @@ -852,7 +852,9 @@ noinline int __filemap_add_folio(struct struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - bool huge = folio_test_hugetlb(folio); + void *alloced_shadow = NULL; + int alloced_order = 0; + bool huge; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -861,6 +863,7 @@ noinline int __filemap_add_folio(struct VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); xas_set_order(&xas, index, folio_order(folio)); + huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; @@ -868,16 +871,10 @@ noinline int __filemap_add_folio(struct folio->mapping = mapping; folio->index = xas.xa_index; - do { - unsigned int order = xa_get_order(xas.xa, xas.xa_index); + for (;;) { + int order = -1, split_order = 0; void *entry, *old = NULL; - if (order > folio_order(folio)) { - xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), - order, gfp); - if (xas_error(&xas)) - goto error; - } xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; @@ -885,19 +882,33 @@ noinline int __filemap_add_folio(struct xas_set_err(&xas, -EEXIST); goto unlock; } + /* + * If a larger entry exists, + * it will be the first and only entry iterated. + */ + if (order == -1) + order = xas_get_order(&xas); + } + + /* entry may have changed before we re-acquire the lock */ + if (alloced_order && (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; } if (old) { - if (shadowp) - *shadowp = old; - /* entry may have been split before we acquired lock */ - order = xa_get_order(xas.xa, xas.xa_index); - if (order > folio_order(folio)) { + if (order > 0 && order > folio_order(folio)) { /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); + if (!alloced_order) { + split_order = order; + goto unlock; + } xas_split(&xas, old, order); xas_reset(&xas); } + if (shadowp) + *shadowp = old; } xas_store(&xas, folio); @@ -913,9 +924,24 @@ noinline int __filemap_add_folio(struct __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } + unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } if (xas_error(&xas)) goto error; _ Patches currently in -mm which might be from kasong@xxxxxxxxxxx are mm-filemap-return-early-if-failed-to-allocate-memory-for-split.patch mm-filemap-clean-up-hugetlb-exclusion-code.patch lib-xarray-introduce-a-new-helper-xas_get_order.patch mm-filemap-optimize-filemap-folio-adding.patch