On Thu, Jan 2, 2025 at 4:59 PM Baoquan He <bhe@xxxxxxxxxx> wrote:
>
> Hi Kairui,
>
> On 12/31/24 at 01:46am, Kairui Song wrote:
> ......snip...
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 7963a0c646a4..e6e58cfb5178 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
> >  	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
> >  }
>
> I am reading swap code, while at it, I am going through this patchset
> too. Have some nitpick, please see below inline comments. Thanks!
> >
> > +/*
> > + * Use the second highest bit of inuse_pages counter as the indicator
> > + * of if one swap device is on the available plist, so the atomic can
>        ~~ redundant?
> > + * still be updated arithmetic while having special data embedded.
>                        ~~~~~~~~~~ typo, arithmetically?
> > + *
> > + * inuse_pages counter is the only thing indicating if a device should
> > + * be on avail_lists or not (except swapon / swapoff). By embedding the
> > + * on-list bit in the atomic counter, updates no longer need any lock
>        ~~~ off-list?

Ah, right, some typos, will fix these.

> > + * to check the list status.
> > + *
> > + * This bit will be set if the device is not on the plist and not
> > + * usable, will be cleared if the device is on the plist.
> > + */
> > +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
> > +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
> > +static long swap_usage_in_pages(struct swap_info_struct *si)
> > +{
> > +	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
> > +}
> > +
> >  /* Reclaim the swap entry anyway if possible */
> >  #define TTRS_ANYWAY		0x1
> >  /*
> > @@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
> >  	int nr_reclaim;
> >
> >  	if (force)
> > -		to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
> > +		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
> >
> >  	while (!list_empty(&si->full_clusters)) {
> >  		ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
> > @@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> >  	return found;
> >  }
> >
> > -static void __del_from_avail_list(struct swap_info_struct *si)
> > +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
> Seems it just says the opposite. The off-list bit is set in
> this function.

Right, the comments are opposite... will fix them.

> > +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> >  {
> >  	int nid;
> > +	unsigned long pages;
> > +
> > +	spin_lock(&swap_avail_lock);
> > +
> > +	if (swapoff) {
> > +		/*
> > +		 * Forcefully remove it. Clear the SWP_WRITEOK flags for
> > +		 * swapoff here so it's synchronized by both si->lock and
> > +		 * swap_avail_lock, to ensure the result can be seen by
> > +		 * add_to_avail_list.
> > +		 */
> > +		lockdep_assert_held(&si->lock);
> > +		si->flags &= ~SWP_WRITEOK;
> > +		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> > +	} else {
> > +		/*
> > +		 * If not called by swapoff, take it off-list only if it's
> > +		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
> > +		 * si->inuse_pages == pages), any concurrent slot freeing,
> > +		 * or device already removed from plist by someone else
> > +		 * will make this return false.
> > +		 */
> > +		pages = si->pages;
> > +		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> > +					     pages | SWAP_USAGE_OFFLIST_BIT))
> > +			goto skip;
> > +	}
> >
> > -	assert_spin_locked(&si->lock);
> >  	for_each_node(nid)
> >  		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > +
> > +skip:
> > +	spin_unlock(&swap_avail_lock);
> >  }
> >
> > -static void del_from_avail_list(struct swap_info_struct *si)
> > +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
>
> Ditto.
>
> > +static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> >  {
> > +	int nid;
> > +	long val;
> > +	unsigned long pages;
> > +
> >  	spin_lock(&swap_avail_lock);
> > -	__del_from_avail_list(si);
> > +
> > +	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
> > +	if (swapon) {
> > +		lockdep_assert_held(&si->lock);
> > +		si->flags |= SWP_WRITEOK;
> > +	} else {
> > +		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
> > +			goto skip;
> > +	}
> > +
> > +	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
> > +		goto skip;
> > +
> > +	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> > +
> > +	/*
> > +	 * When device is full and device is on the plist, only one updater will
> > +	 * see (inuse_pages == si->pages) and will call del_from_avail_list. If
> > +	 * that updater happen to be here, just skip adding.
> > +	 */
> > +	pages = si->pages;
> > +	if (val == pages) {
> > +		/* Just like the cmpxchg in del_from_avail_list */
> > +		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> > +					    pages | SWAP_USAGE_OFFLIST_BIT))
> > +			goto skip;
> > +	}
> > +
> > +	for_each_node(nid)
> > +		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > +
> > +skip:
> >  	spin_unlock(&swap_avail_lock);
> >  }
> >
> > -static void swap_range_alloc(struct swap_info_struct *si,
> > -			     unsigned int nr_entries)
> > +/*
> > + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
>
> Not sure if swap_inuse_add()/swap_inuse_sub() or swap_inuse_cnt_add/sub()
> is better, because it mixes with the usage of si->swap_map[offset].
> Anyway, not strong opinion.
>
> > + * within each cluster, so the total contribution to the global counter should
> > + * always be positive and cannot exceed the total number of usable slots.
> > + */
> > +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
> >  {
> > -	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
> > -	if (si->inuse_pages == si->pages) {
> > -		del_from_avail_list(si);
> > +	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
> >
> > -		if (si->cluster_info && vm_swap_full())
> > -			schedule_work(&si->reclaim_work);
> > +	/*
> > +	 * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
> > +	 * remove it from the plist.
> > +	 */
> > +	if (unlikely(val == si->pages)) {
> > +		del_from_avail_list(si, false);
> > +		return true;
> >  	}
> > +
> > +	return false;
> >  }
> >
> > -static void add_to_avail_list(struct swap_info_struct *si)
> > +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
> >  {
> > -	int nid;
> > +	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
> >
> > -	spin_lock(&swap_avail_lock);
> > -	for_each_node(nid)
> > -		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > -	spin_unlock(&swap_avail_lock);
> > +	/*
> > +	 * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
> > +	 * remove it from the plist.
> > +	 */
> > +	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
> > +		add_to_avail_list(si, false);
> > +}
> > +
> > +static void swap_range_alloc(struct swap_info_struct *si,
> > +			     unsigned int nr_entries)
> > +{
> > +	if (swap_usage_add(si, nr_entries)) {
> > +		if (si->cluster_info && vm_swap_full())
>
> We may not need check si->cluster_info here since it always exists now.

Good catch, it can indeed be dropped as an optimization. A previous patch
in this series was supposed to drop all of these checks; I think I forgot
this one.

> > +			schedule_work(&si->reclaim_work);
> > +	}
> >  }
> >
> >  static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > @@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> >  	for (i = 0; i < nr_entries; i++)
> >  		clear_bit(offset + i, si->zeromap);
> >
> > -	if (si->inuse_pages == si->pages)
> > -		add_to_avail_list(si);
> >  	if (si->flags & SWP_BLKDEV)
> >  		swap_slot_free_notify =
> >  			si->bdev->bd_disk->fops->swap_slot_free_notify;
> > @@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> >  	 */
> >  	smp_wmb();
> >  	atomic_long_add(nr_entries, &nr_swap_pages);
> > -	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> > +	swap_usage_sub(si, nr_entries);
> >  }
> >
> >  static int cluster_alloc_swap(struct swap_info_struct *si,
> > @@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
> >  		spin_unlock(&swap_avail_lock);
> >  		spin_lock(&si->lock);
> > -		if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
> > -			spin_lock(&swap_avail_lock);
> > -			if (plist_node_empty(&si->avail_lists[node])) {
> > -				spin_unlock(&si->lock);
> > -				goto nextsi;
> > -			}
> > -			WARN(!(si->flags & SWP_WRITEOK),
> > -			     "swap_info %d in list but !SWP_WRITEOK\n",
> > -			     si->type);
> > -			__del_from_avail_list(si);
> > -			spin_unlock(&si->lock);
> > -			goto nextsi;
> > -		}
> >  		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> >  					    n_goal, swp_entries, order);
> >  		spin_unlock(&si->lock);
> > @@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >  		cond_resched();
> >
> >  		spin_lock(&swap_avail_lock);
> > -nextsi:
> >  		/*
> >  		 * if we got here, it's likely that si was almost full before,
> >  		 * and since scan_swap_map_slots() can drop the si->lock,
> > @@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
> >  		if (sis->flags & SWP_WRITEOK) {
> >  			n = sis->pages;
> >  			if (free)
> > -				n -= sis->inuse_pages;
> > +				n -= swap_usage_in_pages(sis);
> >  		}
> >  		spin_unlock(&sis->lock);
> >  	}
> > @@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
> >  	swp_entry_t entry;
> >  	unsigned int i;
> >
> > -	if (!READ_ONCE(si->inuse_pages))
> > +	if (!swap_usage_in_pages(si))
> >  		goto success;
> >
> >  retry:
> > @@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
> >
> >  	spin_lock(&mmlist_lock);
> >  	p = &init_mm.mmlist;
> > -	while (READ_ONCE(si->inuse_pages) &&
> > +	while (swap_usage_in_pages(si) &&
> >  	       !signal_pending(current) &&
> >  	       (p = p->next) != &init_mm.mmlist) {
> >
> > @@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
> >  	mmput(prev_mm);
> >
> >  	i = 0;
> > -	while (READ_ONCE(si->inuse_pages) &&
> > +	while (swap_usage_in_pages(si) &&
> >  	       !signal_pending(current) &&
> >  	       (i = find_next_to_unuse(si, i)) != 0) {
> >
> > @@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
> >  	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
> >  	 * and robust (though cpu-intensive) just to keep retrying.
> >  	 */
> > -	if (READ_ONCE(si->inuse_pages)) {
> > +	if (swap_usage_in_pages(si)) {
> >  		if (!signal_pending(current))
> >  			goto retry;
> >  		return -EINTR;
> > @@ -2209,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
> >  success:
> >  	/*
> >  	 * Make sure that further cleanups after try_to_unuse() returns happen
> > -	 * after swap_range_free() reduces si->inuse_pages to 0.
> > +	 * after swap_range_free() reduces inuse_pages to 0.
>
> Here, I personally think the original si->inuse_pages may be better.

I updated this comment to keep people from misusing si->inuse_pages
directly. Anyway, it's a trivial comment, so I can keep it unchanged.
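
For anyone following along, here is a minimal userspace sketch of the
flag-in-counter idea the patch uses, assuming C11 atomics. It is only an
illustration, not the kernel code: the demo_* names and the PAGES constant
are made up, and the swap_avail_lock, plist manipulation, SWP_WRITEOK and
swapoff handling from the real patch are all omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Analogous to SWAP_USAGE_OFFLIST_BIT / SWAP_USAGE_COUNTER_MASK. */
#define OFFLIST_BIT	(1L << 30)
#define COUNTER_MASK	(~OFFLIST_BIT)
#define PAGES		8L	/* pretend the device has 8 usable slots */

static atomic_long inuse;

static long usage_in_pages(void)
{
	/* Read the counter with the flag bit masked off. */
	return atomic_load(&inuse) & COUNTER_MASK;
}

static bool demo_usage_add(long nr)
{
	long val = atomic_fetch_add(&inuse, nr) + nr;

	/*
	 * Only an updater that sees exactly "full and flag clear" tries to
	 * set the flag; the cmpxchg loses to any concurrent change.
	 */
	if (val == PAGES) {
		long expected = PAGES;

		if (atomic_compare_exchange_strong(&inuse, &expected,
						   PAGES | OFFLIST_BIT))
			return true;	/* caller would take the device off the list */
	}
	return false;
}

static void demo_usage_sub(long nr)
{
	long val = atomic_fetch_sub(&inuse, nr) - nr;

	/* Flag still set after freeing: clear it and put the device back. */
	if (val & OFFLIST_BIT)
		atomic_fetch_and(&inuse, COUNTER_MASK);
}

int main(void)
{
	demo_usage_add(PAGES);				/* device full, flag set */
	printf("in use: %ld\n", usage_in_pages());	/* 8 */
	demo_usage_sub(1);				/* flag cleared again */
	printf("in use: %ld\n", usage_in_pages());	/* 7 */
	return 0;
}

The point being that the common alloc/free paths only do relaxed arithmetic
on the counter, while the cmpxchg (plus, in the kernel, the swap_avail_lock
around the plist updates) is only needed on the rare full / not-full
transitions.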