Kairui Song <ryncsn@xxxxxxxxx> writes:

> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> Currently, shmem uses cluster readahead for all swap backends. Cluster
> readahead is not a good solution for ramdisk based devices (e.g. ZRAM)
> at all.
>
> After switching to the new helper, most benchmarks showed good results:
>
> - Single file sequential read:
>   perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
>   (/tmpfs/test is a zero-filled file, using brd as swap, 4G memcg limit)
>   Before: 22.248 +- 0.549
>   After:  22.021 +- 0.684 (-1.1%)
>
> - Random read stress test:
>   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>     --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
>     --time_based --ramp_time=1m --runtime=5m --group_reporting
>   (using brd as swap, 2G memcg limit)
>
>   Before: 1818MiB/s
>   After:  1888MiB/s (+3.85%)
>
> - Zipf-biased random read stress test:
>   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>     --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
>     --time_based --ramp_time=1m --runtime=5m --group_reporting
>   (using brd as swap, 2G memcg limit)
>
>   Before: 31.1GiB/s
>   After:  32.3GiB/s (+3.86%)
>
> So cluster readahead doesn't help much even for sequential read, and
> for the random stress tests, performance is better without it.
>
> Considering that both memory and the swap device will slowly become
> more fragmented, and that the commonly used ZRAM consumes much more
> CPU than a plain ramdisk, false readahead could occur more frequently
> and waste more CPU. Direct swapin is cheaper, so use the new helper
> and skip readahead for SWP_SYNCHRONOUS_IO devices.

It's good to take advantage of swap_direct (no readahead).  I also hope
we can take advantage of VMA based swapin if shmem is accessed via mmap.
That appears possible.

--
Best Regards,
Huang, Ying

> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  mm/shmem.c      | 67 +++++++++++++++++++++++++------------------
>  mm/swap.h       |  9 -------
>  mm/swap_state.c | 11 ++++++--
>  3 files changed, 43 insertions(+), 44 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9da9f7a0e620..3c0729fe934d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1564,20 +1564,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
>  static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
>  			pgoff_t index, unsigned int order, pgoff_t *ilx);
>
> -static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
> -		struct shmem_inode_info *info, pgoff_t index)
> -{
> -	struct mempolicy *mpol;
> -	pgoff_t ilx;
> -	struct folio *folio;
> -
> -	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> -	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
> -	mpol_cond_put(mpol);
> -
> -	return folio;
> -}
> -
>  /*
>   * Make sure huge_gfp is always more limited than limit_gfp.
>   * Some of the flags set permissions, while others set limitations.
> @@ -1851,9 +1837,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  {
>  	struct address_space *mapping = inode->i_mapping;
>  	struct shmem_inode_info *info = SHMEM_I(inode);
> +	enum swap_cache_result cache_result;
>  	struct swap_info_struct *si;
>  	struct folio *folio = NULL;
> +	struct mempolicy *mpol;
>  	swp_entry_t swap;
> +	pgoff_t ilx;
>  	int error;
>
>  	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> @@ -1871,36 +1860,40 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  		return -EINVAL;
>  	}
>
> -	/* Look it up and read it in.. */
> -	folio = swap_cache_get_folio(swap, NULL, 0, NULL);
> +	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> +	folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
> +	mpol_cond_put(mpol);
> +
>  	if (!folio) {
> -		/* Or update major stats only when swapin succeeds?? */
> +		error = -ENOMEM;
> +		goto failed;
> +	}
> +	if (cache_result != SWAP_CACHE_HIT) {
>  		if (fault_type) {
>  			*fault_type |= VM_FAULT_MAJOR;
>  			count_vm_event(PGMAJFAULT);
>  			count_memcg_event_mm(fault_mm, PGMAJFAULT);
>  		}
> -		/* Here we actually start the io */
> -		folio = shmem_swapin_cluster(swap, gfp, info, index);
> -		if (!folio) {
> -			error = -ENOMEM;
> -			goto failed;
> -		}
>  	}
>
>  	/* We have to do this with folio locked to prevent races */
>  	folio_lock(folio);
> -	if (!folio_test_swapcache(folio) ||
> -	    folio->swap.val != swap.val ||
> -	    !shmem_confirm_swap(mapping, index, swap)) {
> +	if (cache_result != SWAP_CACHE_BYPASS) {
> +		/* With cache bypass, folio is newly allocated, read synchronously, and not in swap cache */
> +		if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
> +			error = -EEXIST;
> +			goto unlock;
> +		}
> +		if (!folio_test_uptodate(folio)) {
> +			error = -EIO;
> +			goto failed;
> +		}
> +		folio_wait_writeback(folio);
> +	}
> +	if (!shmem_confirm_swap(mapping, index, swap)) {
>  		error = -EEXIST;
>  		goto unlock;
>  	}
> -	if (!folio_test_uptodate(folio)) {
> -		error = -EIO;
> -		goto failed;
> -	}
> -	folio_wait_writeback(folio);
>
>  	/*
>  	 * Some architectures may have to restore extra metadata to the
> @@ -1908,12 +1901,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	 */
>  	arch_swap_restore(swap, folio);
>
> -	if (shmem_should_replace_folio(folio, gfp)) {
> +	/* With cache bypass, folio is newly allocated and always respects gfp flags */
> +	if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
>  		error = shmem_replace_folio(&folio, gfp, info, index);
>  		if (error)
>  			goto failed;
>  	}
>
> +	/*
> +	 * The expected value checking below should be enough to ensure
> +	 * only one up-to-date swapin success. swap_free() is called after
> +	 * this, so the entry can't be reused. As long as the mapping still
> +	 * has the old entry value, it's never swapped in or modified.
> +	 */
>  	error = shmem_add_to_page_cache(folio, mapping, index,
>  					swp_to_radix_entry(swap), gfp);
>  	if (error)
> @@ -1924,7 +1924,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	if (sgp == SGP_WRITE)
>  		folio_mark_accessed(folio);
>
> -	delete_from_swap_cache(folio);
> +	if (cache_result != SWAP_CACHE_BYPASS)
> +		delete_from_swap_cache(folio);
>  	folio_mark_dirty(folio);
>  	swap_free(swap);
>  	put_swap_device(si);
> diff --git a/mm/swap.h b/mm/swap.h
> index 8f790a67b948..20f4048c971c 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -57,9 +57,6 @@ void __delete_from_swap_cache(struct folio *folio,
>  void delete_from_swap_cache(struct folio *folio);
>  void clear_shadow_from_swap_cache(int type, unsigned long begin,
>  				  unsigned long end);
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr,
> -		void **shadowp);
>  struct folio *filemap_get_incore_folio(struct address_space *mapping,
>  		pgoff_t index);
>
> @@ -123,12 +120,6 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
>  	return 0;
>  }
>
> -static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr)
> -{
> -	return NULL;
> -}
> -
>  static inline
>  struct folio *filemap_get_incore_folio(struct address_space *mapping,
>  		pgoff_t index)
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3edf4b63158d..10eec68475dd 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -318,7 +318,14 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
>
>  static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
>  {
> -	return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
> +	int count;
> +
> +	if (!data_race(si->flags & SWP_SYNCHRONOUS_IO))
> +		return false;
> +
> +	count = __swap_count(entry);
> +
> +	return (count == 1 || count == SWAP_MAP_SHMEM);
>  }
>
>  static inline bool swap_use_vma_readahead(void)
> @@ -334,7 +341,7 @@ static inline bool swap_use_vma_readahead(void)
>   *
>   * Caller must lock the swap device or hold a reference to keep it valid.
>   */
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> +static struct folio *swap_cache_get_folio(swp_entry_t entry,
>  		struct vm_area_struct *vma, unsigned long addr, void **shadowp)
>  {
>  	struct folio *folio;
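
For anyone reproducing the benchmarks quoted above, here is a minimal
sketch of the test environment. It assumes cgroup v2; the brd ramdisk
size, the cgroup name, and the mount point are illustrative choices of
mine, not details taken from the patch. Only the fio command line is
copied from the cover letter:

  # Set up a brd ramdisk as the swap device (rd_size is in KiB, so 8G here).
  modprobe brd rd_nr=1 rd_size=$((8 * 1024 * 1024))
  mkswap /dev/ram0
  swapon /dev/ram0

  # Create a memcg with a 2G limit so tmpfs pages get pushed out to swap.
  mkdir /sys/fs/cgroup/bench
  echo 2G > /sys/fs/cgroup/bench/memory.max

  # Mount the tmpfs that the workload will target.
  mkdir -p /tmpfs
  mount -t tmpfs tmpfs /tmpfs

  # Run the workload from inside the limited cgroup, e.g. the random read job:
  echo $$ > /sys/fs/cgroup/bench/cgroup.procs
  fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
    --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
    --time_based --ramp_time=1m --runtime=5m --group_reporting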