From: Kairui Song <kasong@xxxxxxxxxxx> This commit implements reclaim during scan for cluster allocator. Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could result in low allocation success rate or early OOM. So to ensure maximum allocation success rate, integrate reclaiming with scanning. If found a range of suitable swap slots but fragmented due to HAS_CACHE, just try to reclaim the slots. Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx> --- include/linux/swap.h | 1 + mm/swapfile.c | 140 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 110 insertions(+), 31 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 5a14b6c65949..9eb740563d63 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -302,6 +302,7 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ + unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ diff --git a/mm/swapfile.c b/mm/swapfile.c index eb3e387e86b2..50e7f600a9a1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * VM_BUG_ON(ci->count != 0); lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); + + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; + /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p, if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - if (ci->flags & CLUSTER_FLAG_FRAG) + if (ci->flags & CLUSTER_FLAG_FRAG) { + p->frag_cluster_nr[ci->order]--; list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); - else + } else { list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); + } ci->flags = CLUSTER_FLAG_NONFULL; } } -static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, - unsigned int nr_pages) +static bool cluster_reclaim_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned long end) { - unsigned char *p = si->swap_map + start; - unsigned char *end = p + nr_pages; + unsigned char *map = si->swap_map; + unsigned long offset; + + spin_unlock(&ci->lock); + spin_unlock(&si->lock); + + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) + continue; + goto out; + default: + goto out; + } + } +out: + spin_lock(&si->lock); + spin_lock(&ci->lock); - while (p < end) - if (*p++) + /* + * Recheck the range no matter reclaim succeeded or not, the slot + * could have been be freed while we are not holding the lock. + */ + for (offset = start; offset < end; offset++) + if (READ_ONCE(map[offset])) return false; return true; } +static bool cluster_scan_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned int nr_pages) +{ + unsigned long offset, end = start + nr_pages; + unsigned char *map = si->swap_map; + bool need_reclaim = false; -static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned int start, unsigned char usage, - unsigned int order) + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (!vm_swap_full()) + return false; + need_reclaim = true; + continue; + default: + return false; + } + } + + if (need_reclaim) + return cluster_reclaim_range(si, ci, start, end); + + return true; +} + +static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) { unsigned int nr_pages = 1 << order; @@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ if (ci->count == SWAPFILE_CLUSTER) { VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; list_del(&ci->list); ci->flags = 0; } @@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne } while (offset <= end) { - if (cluster_scan_range(si, offset, nr_pages)) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { cluster_alloc_range(si, ci, offset, usage, order); *foundp = offset; if (ci->count == SWAPFILE_CLUSTER) { @@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o unsigned char usage) { struct percpu_cluster *cluster; - struct swap_cluster_info *ci, *n; + struct swap_cluster_info *ci; unsigned int offset, found = 0; - LIST_HEAD(fraged); new_cluster: lockdep_assert_held(&si->lock); @@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } if (order < PMD_ORDER) { - list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { - list_move_tail(&ci->list, &fraged); + unsigned int frags = 0; + + while (!list_empty(&si->nonfull_clusters[order])) { + ci = list_first_entry(&si->nonfull_clusters[order], + struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->frag_clusters[order]); ci->flags = CLUSTER_FLAG_FRAG; + si->frag_cluster_nr[order]++; offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); + frags++; if (found) break; } if (!found) { - list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { + /* + * Nonfull clusters are moved to frag tail if we reached + * here, count them too, don't over scan the frag list. + */ + while (frags < si->frag_cluster_nr[order]) { + ci = list_first_entry(&si->frag_clusters[order], + struct swap_cluster_info, list); + /* + * Rotate the frag list to iterate, they were all failing + * high order allocation or moved here due to per-CPU usage, + * this help keeping usable cluster ahead. + */ + list_move_tail(&ci->list, &si->frag_clusters[order]); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); + frags++; if (found) break; } } - - list_splice_tail(&fraged, &si->frag_clusters[order]); } if (found) @@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o /* Order 0 stealing from higher order */ for (int o = 1; o < PMD_ORDER; o++) { - if (!list_empty(&si->frag_clusters[o])) { + /* + * Clusters here have at least one usable slots and can't fail order 0 + * allocation, but reclaim may drop si->lock and race with another user. + */ + while (!list_empty(&si->frag_clusters[o])) { ci = list_first_entry(&si->frag_clusters[o], struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, - 0, usage); - VM_BUG_ON(!found); - goto done; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; } - if (!list_empty(&si->nonfull_clusters[o])) { - ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, - list); + while (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], + struct swap_cluster_info, list); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, 0, usage); - VM_BUG_ON(!found); - goto done; + if (found) + goto done; } } - done: cluster->next[order] = offset; return found; @@ -3053,6 +3131,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&p->nonfull_clusters[i]); INIT_LIST_HEAD(&p->frag_clusters[i]); + p->frag_cluster_nr[i] = 0; } for (i = 0; i < swap_header->info.nr_badpages; i++) { @@ -3096,7 +3175,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (!cluster_info) return nr_extents; - /* * Reduce false cache line sharing between cluster_info and * sharing same address space. -- 2.46.0.rc1.232.g9752f9e123-goog