On Tue, Oct 1, 2024 at 6:20 PM Nhat Pham <nphamcs@xxxxxxxxx> wrote: > > The SWAP_MAP_SHMEM state was introduced in the commit aaa468653b4a > ("swap_info: note SWAP_MAP_SHMEM"), to quickly determine if a swap entry > belongs to shmem during swapoff. > > However, swapoff has since been rewritten in the commit b56a2d8af914 > ("mm: rid swapoff of quadratic complexity"). Now having swap count == > SWAP_MAP_SHMEM value is basically the same as having swap count == 1, > and swap_shmem_alloc() behaves analogously to swap_duplicate(). The only > difference of note is that swap_shmem_alloc() does not check for > -ENOMEM returned from __swap_duplicate(), but it is OK because shmem > never re-duplicates any swap entry it owns. This will stil be safe if we > use (batched) swap_duplicate() instead. > > This commit adds swap_duplicate_nr(), the batched variant of > swap_duplicate(), and removes the SWAP_MAP_SHMEM state and the > associated swap_shmem_alloc() helper to simplify the state machine (both > mentally and in terms of actual code). We will also have an extra > state/special value that can be repurposed (for swap entries that never > gets re-duplicated). > > Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx> > --- > include/linux/swap.h | 16 ++++++++-------- > mm/shmem.c | 2 +- > mm/swapfile.c | 41 +++++++++++++++++++++-------------------- > 3 files changed, 30 insertions(+), 29 deletions(-) > > diff --git a/include/linux/swap.h b/include/linux/swap.h > index ca533b478c21..017f3c03ff7a 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -232,7 +232,6 @@ enum { > /* Special value in first swap_map */ > #define SWAP_MAP_MAX 0x3e /* Max count */ > #define SWAP_MAP_BAD 0x3f /* Note page is bad */ > -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ > > /* Special value in each swap_map continuation */ > #define SWAP_CONT_MAX 0x7f /* Max count */ > @@ -482,8 +481,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry); > extern swp_entry_t get_swap_page_of_type(int); > extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); > extern int add_swap_count_continuation(swp_entry_t, gfp_t); > -extern void swap_shmem_alloc(swp_entry_t, int); > -extern int swap_duplicate(swp_entry_t); > +extern int swap_duplicate_nr(swp_entry_t, int); > extern int swapcache_prepare(swp_entry_t entry, int nr); > extern void swap_free_nr(swp_entry_t entry, int nr_pages); > extern void swapcache_free_entries(swp_entry_t *entries, int n); > @@ -549,11 +547,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) > return 0; > } > > -static inline void swap_shmem_alloc(swp_entry_t swp, int nr) > -{ > -} > - > -static inline int swap_duplicate(swp_entry_t swp) > +static inline int swap_duplicate_nr(swp_entry_t swp, int nr) > { > return 0; > } > @@ -606,6 +600,12 @@ static inline int add_swap_extent(struct swap_info_struct *sis, > } > #endif /* CONFIG_SWAP */ > > +static inline int swap_duplicate(swp_entry_t entry) > +{ > + return swap_duplicate_nr(entry, 1); > +} > + > + Nit: extra blank line. > static inline void free_swap_and_cache(swp_entry_t entry) > { > free_swap_and_cache_nr(entry, 1); > diff --git a/mm/shmem.c b/mm/shmem.c > index 0613421e09e7..e3f72f99be32 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -1561,7 +1561,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) > __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, > NULL) == 0) { > shmem_recalc_inode(inode, 0, nr_pages); > - swap_shmem_alloc(swap, nr_pages); > + swap_duplicate_nr(swap, nr_pages); > shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); > > mutex_unlock(&shmem_swaplist_mutex); > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 0cded32414a1..9bb94e618914 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -1381,12 +1381,6 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, > if (usage == SWAP_HAS_CACHE) { > VM_BUG_ON(!has_cache); > has_cache = 0; > - } else if (count == SWAP_MAP_SHMEM) { > - /* > - * Or we could insist on shmem.c using a special > - * swap_shmem_free() and free_shmem_swap_and_cache()... > - */ > - count = 0; > } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { > if (count == COUNT_CONTINUED) { > if (swap_count_continued(si, offset, count)) > @@ -3626,7 +3620,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) > > offset = swp_offset(entry); > VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); > - VM_WARN_ON(usage == 1 && nr > 1); > ci = lock_cluster_or_swap_info(si, offset); > > err = 0; > @@ -3652,6 +3645,13 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) > err = -EEXIST; > } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { > err = -EINVAL; > + } else { > + /* > + * The only swap_duplicate_nr() caller that passes nr > 1 is shmem, > + * who never re-duplicates any swap entry it owns. So this should nit: I think "which" is the right word here, but I am not a native speaker :) > + * not happen. > + */ > + VM_WARN_ON(nr > 1 && (count & ~COUNT_CONTINUED) == SWAP_MAP_MAX); Why not return an error in this case? I think we should add recovery for bugs when it's possible and simple, which I believe is the case here. In shmem_writepage() we can add a WARN if swap_duplicate_nr() fails, or propagate an error to the caller as well (perhaps this belongs in a separate patch that does this for swap_shmem_alloc() first). Sorry if I am being paranoid here, please let me know if this is the case. > } > > if (err) > @@ -3686,27 +3686,28 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) > return err; > } > > -/* > - * Help swapoff by noting that swap entry belongs to shmem/tmpfs > - * (in which case its reference count is never incremented). > - */ > -void swap_shmem_alloc(swp_entry_t entry, int nr) > -{ > - __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); > -} > - > -/* > - * Increase reference count of swap entry by 1. > +/** > + * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries > + * by 1. Can we avoid the line break by using "refcount" instead of "reference count"? > + * > + * @entry: first swap entry from which we want to increase the refcount. > + * @nr: Number of entries in range. > + * > * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required > * but could not be atomically allocated. Returns 0, just as if it succeeded, > * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which > * might occur if a page table entry has got corrupted. > + * > + * Note that we are currently not handling the case where nr > 1 and we need to > + * add swap count continuation. This is OK, because no such user exists - shmem > + * is the only user that can pass nr > 1, and it never re-duplicates any swap > + * entry it owns. Do we need this comment when we have the WARN + comment in __swap_duplicate()? > */ > -int swap_duplicate(swp_entry_t entry) > +int swap_duplicate_nr(swp_entry_t entry, int nr) > { > int err = 0; > > - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) > + while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM) > err = add_swap_count_continuation(entry, GFP_ATOMIC); > return err; > } > -- > 2.43.5