Hi Kairui,

On Tue, Jul 30, 2024 at 11:49 PM <chrisl@xxxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> This commit implements reclaim during scan for the cluster allocator.
>
> Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which
> could result in a low allocation success rate or early OOM.
>
> So to ensure a maximum allocation success rate, integrate reclaiming
> with scanning. If a range of suitable swap slots is found but is
> fragmented due to HAS_CACHE, just try to reclaim the slots.
>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  include/linux/swap.h |   1 +
>  mm/swapfile.c        | 140 +++++++++++++++++++++++++++++++++++++++------------
>  2 files changed, 110 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 5a14b6c65949..9eb740563d63 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -302,6 +302,7 @@ struct swap_info_struct {
>         /* list of cluster that contains at least one free slot */
>         struct list_head frag_clusters[SWAP_NR_ORDERS];
>         /* list of cluster that are fragmented or contented */
> +       unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
>         unsigned int lowest_bit;        /* index of first free in swap_map */
>         unsigned int highest_bit;       /* index of last free in swap_map */
>         unsigned int pages;             /* total of usable pages of swap */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index eb3e387e86b2..50e7f600a9a1 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
>         VM_BUG_ON(ci->count != 0);
>         lockdep_assert_held(&si->lock);
>         lockdep_assert_held(&ci->lock);
> +
> +       if (ci->flags & CLUSTER_FLAG_FRAG)
> +               si->frag_cluster_nr[ci->order]--;
> +
>         /*
>          * If the swap is discardable, prepare discard the cluster
>          * instead of free it immediately. The cluster will be freed
> @@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
>
>         if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
>                 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
> -               if (ci->flags & CLUSTER_FLAG_FRAG)
> +               if (ci->flags & CLUSTER_FLAG_FRAG) {
> +                       p->frag_cluster_nr[ci->order]--;
>                         list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
> -               else
> +               } else {
>                         list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
> +               }
>                 ci->flags = CLUSTER_FLAG_NONFULL;
>         }
>  }
>
> -static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
> -                                     unsigned int nr_pages)
> +static bool cluster_reclaim_range(struct swap_info_struct *si,
> +                                 struct swap_cluster_info *ci,
> +                                 unsigned long start, unsigned long end)
>  {
> -       unsigned char *p = si->swap_map + start;
> -       unsigned char *end = p + nr_pages;
> +       unsigned char *map = si->swap_map;
> +       unsigned long offset;
> +
> +       spin_unlock(&ci->lock);
> +       spin_unlock(&si->lock);
> +
> +       for (offset = start; offset < end; offset++) {
> +               switch (READ_ONCE(map[offset])) {
> +               case 0:
> +                       continue;
> +               case SWAP_HAS_CACHE:
> +                       if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
> +                               continue;
> +                       goto out;
> +               default:
> +                       goto out;
> +               }
> +       }
> +out:
> +       spin_lock(&si->lock);
> +       spin_lock(&ci->lock);
>
> -       while (p < end)
> -               if (*p++)
> +       /*
> +        * Recheck the range no matter reclaim succeeded or not, the slot
> +        * could have been freed while we are not holding the lock.
> +        */
> +       for (offset = start; offset < end; offset++)
> +               if (READ_ONCE(map[offset]))
>                         return false;
>
>         return true;
>  }
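
(Aside for other reviewers: the pattern above is scan first, and only
fall back to reclaim when every busy slot in the range is merely
SWAP_HAS_CACHE. A toy user-space model of that flow, with made-up names
and flag values, single-threaded, locking and races elided, so only the
shape matches the patch:

#include <stdbool.h>
#include <stdio.h>

#define FREE      0x00
#define HAS_CACHE 0x40          /* stand-in for SWAP_HAS_CACHE */
#define IN_USE    0x01

static unsigned char map[8] = { FREE, HAS_CACHE, FREE, HAS_CACHE,
                                FREE, FREE, IN_USE, FREE };

/* Pretend reclaim: drop the cache-only copy so the slot becomes free. */
static bool try_reclaim(unsigned long off)
{
        if (map[off] == HAS_CACHE) {
                map[off] = FREE;
                return true;
        }
        return false;
}

static bool scan_range(unsigned long start, unsigned long end, bool swap_full)
{
        bool need_reclaim = false;
        unsigned long off;

        for (off = start; off < end; off++) {
                if (map[off] == FREE)
                        continue;
                if (map[off] == HAS_CACHE && swap_full) {
                        need_reclaim = true;    /* reusable after reclaim */
                        continue;
                }
                return false;                   /* in use, or not worth it */
        }

        if (!need_reclaim)
                return true;

        /* In the kernel this part runs with si->lock and ci->lock dropped. */
        for (off = start; off < end; off++)
                if (map[off] && !try_reclaim(off))
                        return false;

        /* Recheck everything: concurrent users may have raced with us. */
        for (off = start; off < end; off++)
                if (map[off])
                        return false;
        return true;
}

int main(void)
{
        printf("0..4 usable: %d\n", scan_range(0, 4, true));    /* 1 */
        printf("4..8 usable: %d\n", scan_range(4, 8, true));    /* 0 */
        return 0;
}

The unconditional recheck after retaking the locks is the important bit,
and the patch gets it right.)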
>
> +static bool cluster_scan_range(struct swap_info_struct *si,
> +                              struct swap_cluster_info *ci,
> +                              unsigned long start, unsigned int nr_pages)
> +{
> +       unsigned long offset, end = start + nr_pages;
> +       unsigned char *map = si->swap_map;
> +       bool need_reclaim = false;
>
> -static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
> -                                      unsigned int start, unsigned char usage,
> -                                      unsigned int order)
> +       for (offset = start; offset < end; offset++) {
> +               switch (READ_ONCE(map[offset])) {
> +               case 0:
> +                       continue;
> +               case SWAP_HAS_CACHE:
> +                       if (!vm_swap_full())
> +                               return false;
> +                       need_reclaim = true;
> +                       continue;
> +               default:
> +                       return false;
> +               }
> +       }
> +
> +       if (need_reclaim)
> +               return cluster_reclaim_range(si, ci, start, end);
> +
> +       return true;
> +}
> +
> +static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
> +                               unsigned int start, unsigned char usage,
> +                               unsigned int order)
>  {
>         unsigned int nr_pages = 1 << order;
>
> @@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
>         if (ci->count == SWAPFILE_CLUSTER) {
>                 VM_BUG_ON(!(ci->flags &
>                           (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
> +               if (ci->flags & CLUSTER_FLAG_FRAG)
> +                       si->frag_cluster_nr[ci->order]--;
>                 list_del(&ci->list);
>                 ci->flags = 0;
>         }
> @@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
>         }
>
>         while (offset <= end) {
> -               if (cluster_scan_range(si, offset, nr_pages)) {
> +               if (cluster_scan_range(si, ci, offset, nr_pages)) {
>                         cluster_alloc_range(si, ci, offset, usage, order);
>                         *foundp = offset;
>                         if (ci->count == SWAPFILE_CLUSTER) {
> @@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                                               unsigned char usage)
>  {
>         struct percpu_cluster *cluster;
> -       struct swap_cluster_info *ci, *n;
> +       struct swap_cluster_info *ci;
>         unsigned int offset, found = 0;
> -       LIST_HEAD(fraged);
>
>  new_cluster:
>         lockdep_assert_held(&si->lock);
> @@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>         }
>
>         if (order < PMD_ORDER) {
> -               list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
> -                       list_move_tail(&ci->list, &fraged);
> +               unsigned int frags = 0;
> +
> +               while (!list_empty(&si->nonfull_clusters[order])) {
> +                       ci = list_first_entry(&si->nonfull_clusters[order],
> +                                             struct swap_cluster_info, list);
> +                       list_move_tail(&ci->list, &si->frag_clusters[order]);
>                         ci->flags = CLUSTER_FLAG_FRAG;
> +                       si->frag_cluster_nr[order]++;
>                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                          &found, order, usage);
> +                       frags++;
>                         if (found)
>                                 break;
>                 }

One minor nitpick here. The line
"list_splice_tail(&fraged, &si->frag_clusters[order]);" is removed in a
later part of this patch, so we can simplify this to:

    if (found)
            goto done;

>
>                 if (!found) {

Then we can remove this test, because the "goto done" above already
covers it, and save one level of indentation. No functional change; we
can address this in the next iteration.
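
Roughly, the simplified flow I have in mind would look like this
(an untested sketch on top of this patch, just to show the shape):

        if (order < PMD_ORDER) {
                unsigned int frags = 0;

                while (!list_empty(&si->nonfull_clusters[order])) {
                        ci = list_first_entry(&si->nonfull_clusters[order],
                                              struct swap_cluster_info, list);
                        list_move_tail(&ci->list, &si->frag_clusters[order]);
                        ci->flags = CLUSTER_FLAG_FRAG;
                        si->frag_cluster_nr[order]++;
                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                         &found, order, usage);
                        frags++;
                        if (found)
                                goto done;      /* was: break */
                }

                /* No "if (!found)" nesting needed any more. */
                while (frags < si->frag_cluster_nr[order]) {
                        ci = list_first_entry(&si->frag_clusters[order],
                                              struct swap_cluster_info, list);
                        /* Rotate the frag list to keep usable clusters ahead. */
                        list_move_tail(&ci->list, &si->frag_clusters[order]);
                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                         &found, order, usage);
                        frags++;
                        if (found)
                                goto done;
                }
        }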
> -                       list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
> +                       /*
> +                        * Nonfull clusters are moved to frag tail if we reached
> +                        * here, count them too, don't over-scan the frag list.
> +                        */
> +                       while (frags < si->frag_cluster_nr[order]) {
> +                               ci = list_first_entry(&si->frag_clusters[order],
> +                                                     struct swap_cluster_info, list);
> +                               /*
> +                                * Rotate the frag list to iterate, they were all failing
> +                                * high order allocation or moved here due to per-CPU usage,
> +                                * this helps keep usable clusters ahead.
> +                                */
> +                               list_move_tail(&ci->list, &si->frag_clusters[order]);
>                                 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                                  &found, order, usage);
> +                               frags++;
>                                 if (found)
>                                         break;
>                         }
>                 }
> -
> -               list_splice_tail(&fraged, &si->frag_clusters[order]);

This is the line that gets removed, so we can use "goto done" instead.

Chris

>         }
>
>         if (found)
> @@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>
>         /* Order 0 stealing from higher order */
>         for (int o = 1; o < PMD_ORDER; o++) {
> -               if (!list_empty(&si->frag_clusters[o])) {
> +               /*
> +                * Clusters here have at least one usable slot and can't fail order 0
> +                * allocation, but reclaim may drop si->lock and race with another user.
> +                */
> +               while (!list_empty(&si->frag_clusters[o])) {
>                         ci = list_first_entry(&si->frag_clusters[o],
>                                               struct swap_cluster_info, list);
> -                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
> -                                                        0, usage);
> -                       VM_BUG_ON(!found);
> -                       goto done;
> +                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> +                                                        &found, 0, usage);
> +                       if (found)
> +                               goto done;
>                 }
>
> -               if (!list_empty(&si->nonfull_clusters[o])) {
> -                       ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
> -                                             list);
> +               while (!list_empty(&si->nonfull_clusters[o])) {
> +                       ci = list_first_entry(&si->nonfull_clusters[o],
> +                                             struct swap_cluster_info, list);
>                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                          &found, 0, usage);
> -                       VM_BUG_ON(!found);
> -                       goto done;
> +                       if (found)
> +                               goto done;
>                 }
>         }
> -
>  done:
>         cluster->next[order] = offset;
>         return found;
> @@ -3053,6 +3131,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>         for (i = 0; i < SWAP_NR_ORDERS; i++) {
>                 INIT_LIST_HEAD(&p->nonfull_clusters[i]);
>                 INIT_LIST_HEAD(&p->frag_clusters[i]);
> +               p->frag_cluster_nr[i] = 0;
>         }
>
>         for (i = 0; i < swap_header->info.nr_badpages; i++) {
> @@ -3096,7 +3175,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>         if (!cluster_info)
>                 return nr_extents;
>
> -
>         /*
>          * Reduce false cache line sharing between cluster_info and
>          * sharing same address space.
>
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>