Hi Kairui,

On Tue, Jul 30, 2024 at 11:49 PM <chrisl@xxxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> This commit implements reclaim during scan for the cluster allocator.
>
> Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which
> could result in a low allocation success rate or early OOM.
>
> So to ensure a maximum allocation success rate, integrate reclaiming
> with scanning. If a range of suitable swap slots is found but is
> fragmented due to HAS_CACHE, just try to reclaim the slots.
>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  include/linux/swap.h |   1 +
>  mm/swapfile.c        | 140 +++++++++++++++++++++++++++++++++++++++------------
>  2 files changed, 110 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 5a14b6c65949..9eb740563d63 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -302,6 +302,7 @@ struct swap_info_struct {
>         /* list of cluster that contains at least one free slot */
>         struct list_head frag_clusters[SWAP_NR_ORDERS];
>         /* list of cluster that are fragmented or contented */
> +       unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
>         unsigned int lowest_bit;        /* index of first free in swap_map */
>         unsigned int highest_bit;       /* index of last free in swap_map */
>         unsigned int pages;             /* total of usable pages of swap */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index eb3e387e86b2..50e7f600a9a1 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
>         VM_BUG_ON(ci->count != 0);
>         lockdep_assert_held(&si->lock);
>         lockdep_assert_held(&ci->lock);
> +
> +       if (ci->flags & CLUSTER_FLAG_FRAG)
> +               si->frag_cluster_nr[ci->order]--;
> +
>         /*
>          * If the swap is discardable, prepare discard the cluster
>          * instead of free it immediately. The cluster will be freed
> @@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
>
>         if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
>                 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
> -               if (ci->flags & CLUSTER_FLAG_FRAG)
> +               if (ci->flags & CLUSTER_FLAG_FRAG) {
> +                       p->frag_cluster_nr[ci->order]--;
>                         list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
> -               else
> +               } else {
>                         list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
> +               }
>                 ci->flags = CLUSTER_FLAG_NONFULL;
>         }
>  }
>
> -static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
> -                                     unsigned int nr_pages)
> +static bool cluster_reclaim_range(struct swap_info_struct *si,
> +                                 struct swap_cluster_info *ci,
> +                                 unsigned long start, unsigned long end)
>  {
> -       unsigned char *p = si->swap_map + start;
> -       unsigned char *end = p + nr_pages;
> +       unsigned char *map = si->swap_map;
> +       unsigned long offset;
> +
> +       spin_unlock(&ci->lock);
> +       spin_unlock(&si->lock);
> +
> +       for (offset = start; offset < end; offset++) {
> +               switch (READ_ONCE(map[offset])) {
> +               case 0:
> +                       continue;
> +               case SWAP_HAS_CACHE:
> +                       if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
> +                               continue;
> +                       goto out;
> +               default:
> +                       goto out;
> +               }
> +       }
> +out:
> +       spin_lock(&si->lock);
> +       spin_lock(&ci->lock);
>
> -       while (p < end)
> -               if (*p++)
> +       /*
> +        * Recheck the range no matter reclaim succeeded or not, the slot
> +        * could have been freed while we are not holding the lock.
> +        */
> +       for (offset = start; offset < end; offset++)
> +               if (READ_ONCE(map[offset]))
>                         return false;
>
>         return true;
>  }
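
(Aside for other reviewers: the pattern above is scan first, and only
fall back to reclaim when every busy slot in the range is merely
SWAP_HAS_CACHE. A toy user-space model of that flow, with made-up names
and flag values, single-threaded, locking and races elided, so only the
shape matches the patch:

#include <stdbool.h>
#include <stdio.h>

#define FREE      0x00
#define HAS_CACHE 0x40          /* stand-in for SWAP_HAS_CACHE */
#define IN_USE    0x01

static unsigned char map[8] = { FREE, HAS_CACHE, FREE, HAS_CACHE,
                                FREE, FREE, IN_USE, FREE };

/* Pretend reclaim: drop the cache-only copy so the slot becomes free. */
static bool try_reclaim(unsigned long off)
{
        if (map[off] == HAS_CACHE) {
                map[off] = FREE;
                return true;
        }
        return false;
}

static bool scan_range(unsigned long start, unsigned long end, bool swap_full)
{
        bool need_reclaim = false;
        unsigned long off;

        for (off = start; off < end; off++) {
                if (map[off] == FREE)
                        continue;
                if (map[off] == HAS_CACHE && swap_full) {
                        need_reclaim = true;    /* reusable after reclaim */
                        continue;
                }
                return false;                   /* in use, or not worth it */
        }

        if (!need_reclaim)
                return true;

        /* In the kernel this part runs with si->lock and ci->lock dropped. */
        for (off = start; off < end; off++)
                if (map[off] && !try_reclaim(off))
                        return false;

        /* Recheck everything: concurrent users may have raced with us. */
        for (off = start; off < end; off++)
                if (map[off])
                        return false;
        return true;
}

int main(void)
{
        printf("0..4 usable: %d\n", scan_range(0, 4, true));    /* 1 */
        printf("4..8 usable: %d\n", scan_range(4, 8, true));    /* 0 */
        return 0;
}

The unconditional recheck after retaking the locks is the important bit,
and the patch gets it right.)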
>
> +static bool cluster_scan_range(struct swap_info_struct *si,
> +                              struct swap_cluster_info *ci,
> +                              unsigned long start, unsigned int nr_pages)
> +{
> +       unsigned long offset, end = start + nr_pages;
> +       unsigned char *map = si->swap_map;
> +       bool need_reclaim = false;
>
> -static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
> -                                      unsigned int start, unsigned char usage,
> -                                      unsigned int order)
> +       for (offset = start; offset < end; offset++) {
> +               switch (READ_ONCE(map[offset])) {
> +               case 0:
> +                       continue;
> +               case SWAP_HAS_CACHE:
> +                       if (!vm_swap_full())
> +                               return false;
> +                       need_reclaim = true;
> +                       continue;
> +               default:
> +                       return false;
> +               }
> +       }
> +
> +       if (need_reclaim)
> +               return cluster_reclaim_range(si, ci, start, end);
> +
> +       return true;
> +}
> +
> +static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
> +                               unsigned int start, unsigned char usage,
> +                               unsigned int order)
>  {
>         unsigned int nr_pages = 1 << order;
>
> @@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
>         if (ci->count == SWAPFILE_CLUSTER) {
>                 VM_BUG_ON(!(ci->flags &
>                           (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
> +               if (ci->flags & CLUSTER_FLAG_FRAG)
> +                       si->frag_cluster_nr[ci->order]--;
>                 list_del(&ci->list);
>                 ci->flags = 0;
>         }
> @@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
>         }
>
>         while (offset <= end) {
> -               if (cluster_scan_range(si, offset, nr_pages)) {
> +               if (cluster_scan_range(si, ci, offset, nr_pages)) {
>                         cluster_alloc_range(si, ci, offset, usage, order);
>                         *foundp = offset;
>                         if (ci->count == SWAPFILE_CLUSTER) {
> @@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                                               unsigned char usage)
>  {
>         struct percpu_cluster *cluster;
> -       struct swap_cluster_info *ci, *n;
> +       struct swap_cluster_info *ci;
>         unsigned int offset, found = 0;
> -       LIST_HEAD(fraged);
>
>  new_cluster:
>         lockdep_assert_held(&si->lock);
> @@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>         }
>
>         if (order < PMD_ORDER) {
> -               list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
> -                       list_move_tail(&ci->list, &fraged);
> +               unsigned int frags = 0;
> +
> +               while (!list_empty(&si->nonfull_clusters[order])) {
> +                       ci = list_first_entry(&si->nonfull_clusters[order],
> +                                             struct swap_cluster_info, list);
> +                       list_move_tail(&ci->list, &si->frag_clusters[order]);
>                         ci->flags = CLUSTER_FLAG_FRAG;
> +                       si->frag_cluster_nr[order]++;
>                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                          &found, order, usage);
> +                       frags++;
>                         if (found)
>                                 break;
>                 }

One minor nitpick here. The line
"list_splice_tail(&fraged, &si->frag_clusters[order]);" is removed in a
later part of this patch, so we can simplify this to:

    if (found)
            goto done;

>
>                 if (!found) {

Then we can remove this test, because the "goto done" above already
covers it, and save one level of indentation. No functional change; we
can address this in the next iteration.
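
Roughly, the simplified flow I have in mind would look like this
(an untested sketch on top of this patch, just to show the shape):

        if (order < PMD_ORDER) {
                unsigned int frags = 0;

                while (!list_empty(&si->nonfull_clusters[order])) {
                        ci = list_first_entry(&si->nonfull_clusters[order],
                                              struct swap_cluster_info, list);
                        list_move_tail(&ci->list, &si->frag_clusters[order]);
                        ci->flags = CLUSTER_FLAG_FRAG;
                        si->frag_cluster_nr[order]++;
                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                         &found, order, usage);
                        frags++;
                        if (found)
                                goto done;      /* was: break */
                }

                /* No "if (!found)" nesting needed any more. */
                while (frags < si->frag_cluster_nr[order]) {
                        ci = list_first_entry(&si->frag_clusters[order],
                                              struct swap_cluster_info, list);
                        /* Rotate the frag list to keep usable clusters ahead. */
                        list_move_tail(&ci->list, &si->frag_clusters[order]);
                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                         &found, order, usage);
                        frags++;
                        if (found)
                                goto done;
                }
        }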
> -                       list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
> +                       /*
> +                        * Nonfull clusters are moved to frag tail if we reached
> +                        * here, count them too, don't over-scan the frag list.
> +                        */
> +                       while (frags < si->frag_cluster_nr[order]) {
> +                               ci = list_first_entry(&si->frag_clusters[order],
> +                                                     struct swap_cluster_info, list);
> +                               /*
> +                                * Rotate the frag list to iterate, they were all failing
> +                                * high order allocation or moved here due to per-CPU usage,
> +                                * this helps keep usable clusters ahead.
> +                                */
> +                               list_move_tail(&ci->list, &si->frag_clusters[order]);
>                                 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                                  &found, order, usage);
> +                               frags++;
>                                 if (found)
>                                         break;
>                         }
>                 }
> -
> -               list_splice_tail(&fraged, &si->frag_clusters[order]);

This is the line that gets removed, so we can use "goto done" instead.

Chris

>         }
>
>         if (found)
> @@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>
>         /* Order 0 stealing from higher order */
>         for (int o = 1; o < PMD_ORDER; o++) {
> -               if (!list_empty(&si->frag_clusters[o])) {
> +               /*
> +                * Clusters here have at least one usable slot and can't fail order 0
> +                * allocation, but reclaim may drop si->lock and race with another user.
> +                */
> +               while (!list_empty(&si->frag_clusters[o])) {
>                         ci = list_first_entry(&si->frag_clusters[o],
>                                               struct swap_cluster_info, list);
> -                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
> -                                                        0, usage);
> -                       VM_BUG_ON(!found);
> -                       goto done;
> +                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> +                                                        &found, 0, usage);
> +                       if (found)
> +                               goto done;
>                 }
>
> -               if (!list_empty(&si->nonfull_clusters[o])) {
> -                       ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
> -                                             list);
> +               while (!list_empty(&si->nonfull_clusters[o])) {
> +                       ci = list_first_entry(&si->nonfull_clusters[o],
> +                                             struct swap_cluster_info, list);
>                         offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                          &found, 0, usage);
> -                       VM_BUG_ON(!found);
> -                       goto done;
> +                       if (found)
> +                               goto done;
>                 }
>         }
> -
>  done:
>         cluster->next[order] = offset;
>         return found;
> @@ -3053,6 +3131,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>         for (i = 0; i < SWAP_NR_ORDERS; i++) {
>                 INIT_LIST_HEAD(&p->nonfull_clusters[i]);
>                 INIT_LIST_HEAD(&p->frag_clusters[i]);
> +               p->frag_cluster_nr[i] = 0;
>         }
>
>         for (i = 0; i < swap_header->info.nr_badpages; i++) {
> @@ -3096,7 +3175,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>         if (!cluster_info)
>                 return nr_extents;
>
> -
>         /*
>          * Reduce false cache line sharing between cluster_info and
>          * sharing same address space.
>
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>