On Mon, 1 Jul 2024 20:15:46 +0300 Usama Arif <usamaarif642@xxxxxxxxx> wrote:

> Changes from v7 (Johannes):
> - Give overview and explain how locking works in zeromap in comments
> - Add comment for why last word is checked first when checking if
>   folio is zero-filled
> - Merge is_folio_zero_filled and is_folio_page_zero_filled into
>   1 function.
> - Use folio_zero_range to fill a folio with zero at readtime.
> - Put swap_zeromap_folio_clear in an else branch (even if checkpatch
>   gives warning) and add comment to make it explicitly clear that it
>   needs to happen if folio is not zero filled.
> - add missing kvfree for zeromap incase swapon fails.

I queued the below as a delta against what was in mm-unstable.

Can we please get this nailed down?  It has been nearly three weeks and
this patch is getting in the way of the "mm/zsmalloc: change back to
per-size_class lock" series, at least.

 include/linux/swap.h |    2 -
 mm/page_io.c         |   64 +++++++++++++++++++++--------------------
 mm/swapfile.c        |    9 +++--
 3 files changed, 40 insertions(+), 35 deletions(-)

--- a/include/linux/swap.h~mm-store-zero-pages-to-be-swapped-out-in-a-bitmap-v8
+++ a/include/linux/swap.h
@@ -299,7 +299,7 @@ struct swap_info_struct {
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
-	unsigned long *zeromap;		/* vmalloc'ed bitmap to track zero pages */
+	unsigned long *zeromap;		/* kvmalloc'ed bitmap to track zero pages */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct swap_cluster_list free_clusters; /* free clusters list */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
--- a/mm/page_io.c~mm-store-zero-pages-to-be-swapped-out-in-a-bitmap-v8
+++ a/mm/page_io.c
@@ -172,42 +172,34 @@ bad_bmap:
 	goto out;
 }
 
-static bool is_folio_page_zero_filled(struct folio *folio, int i)
-{
-	unsigned long *data;
-	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1;
-	bool ret = false;
-
-	data = kmap_local_folio(folio, i * PAGE_SIZE);
-	if (data[last_pos])
-		goto out;
-	for (pos = 0; pos < PAGE_SIZE / sizeof(*data); pos++) {
-		if (data[pos])
-			goto out;
-	}
-	ret = true;
-out:
-	kunmap_local(data);
-	return ret;
-}
-
 static bool is_folio_zero_filled(struct folio *folio)
 {
+	unsigned int pos, last_pos;
+	unsigned long *data;
 	unsigned int i;
 
+	last_pos = PAGE_SIZE / sizeof(*data) - 1;
 	for (i = 0; i < folio_nr_pages(folio); i++) {
-		if (!is_folio_page_zero_filled(folio, i))
+		data = kmap_local_folio(folio, i * PAGE_SIZE);
+		/*
+		 * Check last word first, incase the page is zero-filled at
+		 * the start and has non-zero data at the end, which is common
+		 * in real-world workloads.
+		 */
+		if (data[last_pos]) {
+			kunmap_local(data);
 			return false;
+		}
+		for (pos = 0; pos < last_pos; pos++) {
+			if (data[pos]) {
+				kunmap_local(data);
+				return false;
+			}
+		}
+		kunmap_local(data);
 	}
-	return true;
-}
 
-static void folio_zero_fill(struct folio *folio)
-{
-	unsigned int i;
-
-	for (i = 0; i < folio_nr_pages(folio); i++)
-		clear_highpage(folio_page(folio, i));
+	return true;
 }
 
 static void swap_zeromap_folio_set(struct folio *folio)
@@ -278,12 +270,24 @@ int swap_writepage(struct page *page, st
 		return ret;
 	}
 
+	/*
+	 * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages.
+	 * The bits in zeromap are protected by the locked swapcache folio
+	 * and atomic updates are used to protect against read-modify-write
+	 * corruption due to other zero swap entries seeing concurrent updates.
+	 */
 	if (is_folio_zero_filled(folio)) {
 		swap_zeromap_folio_set(folio);
 		folio_unlock(folio);
 		return 0;
+	} else {
+		/*
+		 * Clear bits this folio occupies in the zeromap to prevent
+		 * zero data being read in from any previous zero writes that
+		 * occupied the same swap entries.
+		 */
+		swap_zeromap_folio_clear(folio);
 	}
-	swap_zeromap_folio_clear(folio);
 	if (zswap_store(folio)) {
 		folio_unlock(folio);
 		return 0;
@@ -528,7 +532,7 @@ static bool swap_read_folio_zeromap(stru
 	if (WARN_ON_ONCE(idx < folio_nr_pages(folio)))
 		return true;
 
-	folio_zero_fill(folio);
+	folio_zero_range(folio, 0, folio_size(folio));
 	folio_mark_uptodate(folio);
 	return true;
 }
--- a/mm/swapfile.c~mm-store-zero-pages-to-be-swapped-out-in-a-bitmap-v8
+++ a/mm/swapfile.c
@@ -3181,11 +3181,11 @@ SYSCALL_DEFINE2(swapon, const char __use
 	}
 
 	/*
-	 * Use kvmalloc_array instead of bitmap_zalloc as the allocation order
-	 * might be above MAX_PAGE_ORDER incase of a large swap file.
+	 * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
+	 * be above MAX_PAGE_ORDER incase of a large swap file.
 	 */
-	p->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages),
-		sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
+	p->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
+			GFP_KERNEL | __GFP_ZERO);
 	if (!p->zeromap) {
 		error = -ENOMEM;
 		goto bad_swap_unlock_inode;
@@ -3345,6 +3345,7 @@ bad_swap:
 	p->flags = 0;
 	spin_unlock(&swap_lock);
 	vfree(swap_map);
+	kvfree(p->zeromap);
 	kvfree(cluster_info);
 	if (inced_nr_rotate_swap)
 		atomic_dec(&nr_rotate_swap);
_
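
For readers following the "check the last word first" heuristic that the
changelog and the new comment describe, it can be sketched in isolation as
plain user-space C; page_is_zero_filled and the fixed PAGE_SIZE below are
illustrative names for this sketch only, not the kernel's helpers:

	#include <stdbool.h>
	#include <stddef.h>

	#define PAGE_SIZE 4096	/* assumed page size for the sketch */

	static bool page_is_zero_filled(const unsigned long *data)
	{
		/*
		 * Check the last word first: pages that are zero-filled at
		 * the start but carry non-zero data at the end are common,
		 * so this rejects them without scanning the whole page.
		 */
		size_t last_pos = PAGE_SIZE / sizeof(*data) - 1;
		size_t pos;

		if (data[last_pos])
			return false;
		for (pos = 0; pos < last_pos; pos++) {
			if (data[pos])
				return false;
		}
		return true;
	}

The kernel version above does the same scan per page of the folio, with
kmap_local_folio()/kunmap_local() around each access.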