Re: [PATCH v5 6/9] mm: swap: allow cache reclaim to skip slot cache

Kairui Song <ryncsn@xxxxxxxxx> · Sat, 3 Aug 2024 20:18:57 +0800

On Sat, Aug 3, 2024 at 6:39 PM Barry Song <baohua@xxxxxxxxxx> wrote:
>
> On Wed, Jul 31, 2024 at 2:49 PM <chrisl@xxxxxxxxxx> wrote:
> >
> > From: Kairui Song <kasong@xxxxxxxxxxx>
> >
> > Currently we free the reclaimed slots through slot cache even
> > if the slot is required to be empty immediately. As a result
> > the reclaim caller will see the slot still occupied even after a
> > successful reclaim, and need to keep reclaiming until slot cache
> > get flushed. This caused ineffective or over reclaim when SWAP is
> > under stress.
> >
> > So introduce a new flag allowing the slot to be emptied bypassing
> > the slot cache.
> >
> > Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> > ---
> >  mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 109 insertions(+), 43 deletions(-)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 9b63b2262cc2..4c0fc0409d3c 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -53,8 +53,15 @@
> >  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
> >                                  unsigned char);
> >  static void free_swap_count_continuations(struct swap_info_struct *);
> > +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
> > +                                 unsigned int nr_pages);
> >  static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
> >                              unsigned int nr_entries);
> > +static bool folio_swapcache_freeable(struct folio *folio);
> > +static struct swap_cluster_info *lock_cluster_or_swap_info(
> > +               struct swap_info_struct *si, unsigned long offset);
> > +static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> > +                                       struct swap_cluster_info *ci);
> >
> >  static DEFINE_SPINLOCK(swap_lock);
> >  static unsigned int nr_swapfiles;
> > @@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent)
> >   * corresponding page
> >   */
> >  #define TTRS_UNMAPPED          0x2
> > -/* Reclaim the swap entry if swap is getting full*/
> > +/* Reclaim the swap entry if swap is getting full */
> >  #define TTRS_FULL              0x4
> > +/* Reclaim directly, bypass the slot cache and don't touch device lock */
> > +#define TTRS_DIRECT            0x8
> > +
> > +static bool swap_is_has_cache(struct swap_info_struct *si,
> > +                             unsigned long offset, int nr_pages)
> > +{
> > +       unsigned char *map = si->swap_map + offset;
> > +       unsigned char *map_end = map + nr_pages;
> > +
> > +       do {
> > +               VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
> > +               if (*map != SWAP_HAS_CACHE)
> > +                       return false;
> > +       } while (++map < map_end);
> > +
> > +       return true;
> > +}
> >
> >  /*
> >   * returns number of pages in the folio that backs the swap entry. If positive,
> > @@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> >                                  unsigned long offset, unsigned long flags)
> >  {
> >         swp_entry_t entry = swp_entry(si->type, offset);
> > +       struct address_space *address_space = swap_address_space(entry);
> > +       struct swap_cluster_info *ci;
> >         struct folio *folio;
> > -       int ret = 0;
> > +       int ret, nr_pages;
> > +       bool need_reclaim;
> >
> > -       folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
> > +       folio = filemap_get_folio(address_space, swap_cache_index(entry));
> >         if (IS_ERR(folio))
> >                 return 0;
> > +
> > +       /* offset could point to the middle of a large folio */
> > +       entry = folio->swap;
> > +       offset = swp_offset(entry);
> > +       nr_pages = folio_nr_pages(folio);
> > +       ret = -nr_pages;
> > +
> >         /*
> >          * When this function is called from scan_swap_map_slots() and it's
> >          * called by vmscan.c at reclaiming folios. So we hold a folio lock
> > @@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> >          * case and you should use folio_free_swap() with explicit folio_lock()
> >          * in usual operations.
> >          */
> > -       if (folio_trylock(folio)) {
> > -               if ((flags & TTRS_ANYWAY) ||
> > -                   ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> > -                   ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
> > -                       ret = folio_free_swap(folio);
> > -               folio_unlock(folio);
> > +       if (!folio_trylock(folio))
> > +               goto out;
> > +
> > +       need_reclaim = ((flags & TTRS_ANYWAY) ||
> > +                       ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
> > +                       ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
> > +       if (!need_reclaim || !folio_swapcache_freeable(folio))
> > +               goto out_unlock;
> > +
> > +       /*
> > +        * It's safe to delete the folio from swap cache only if the folio's
> > +        * swap_map is HAS_CACHE only, which means the slots have no page table
> > +        * reference or pending writeback, and can't be allocated to others.
> > +        */
> > +       ci = lock_cluster_or_swap_info(si, offset);
> > +       need_reclaim = swap_is_has_cache(si, offset, nr_pages);
> > +       unlock_cluster_or_swap_info(si, ci);
> > +       if (!need_reclaim)
> > +               goto out_unlock;
> > +
> > +       if (!(flags & TTRS_DIRECT)) {
> > +               /* Free through slot cache */
> > +               delete_from_swap_cache(folio);
> > +               folio_set_dirty(folio);
> > +               ret = nr_pages;
> > +               goto out_unlock;
> >         }
> > -       ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
> > +
> > +       xa_lock_irq(&address_space->i_pages);
> > +       __delete_from_swap_cache(folio, entry, NULL);
> > +       xa_unlock_irq(&address_space->i_pages);
> > +       folio_ref_sub(folio, nr_pages);
> > +       folio_set_dirty(folio);
> > +
> > +       spin_lock(&si->lock);
> > +       /* Only sinple page folio can be backed by zswap */
> > +       if (!nr_pages)
> > +               zswap_invalidate(entry);
>
> I am trying to figure out if I am mad :-)  Does nr_pages == 0 means single
> page folio?
>

Hi Barry

I'm sorry, this should be nr_pages == 1, I messed up order and nr, as
zswap only works for single page folios.