Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch-free contiguous swap entries. Thanks to the work of Chris
and Kairui[2], I was able to achieve this optimization with
minimal code changes by building on their efforts.
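
For testing, something along the lines of the program below can be used
to exercise the path. This is only a rough, hypothetical sketch (not the
workload from Zhiguo's report); the mapping size and the use of
MADV_PAGEOUT are arbitrary choices here, and it assumes a swap device
large enough for the mapping, optionally with mTHP enabled through the
transparent_hugepage sysfs interface:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif

#define SIZE (1024UL * 1024 * 1024)     /* 1GB anonymous mapping */

int main(void)
{
        struct timespec t0, t1;
        char *p;

        p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Populate the pages, then push them out to swap. */
        memset(p, 0x11, SIZE);
        if (madvise(p, SIZE, MADV_PAGEOUT))
                perror("madvise(MADV_PAGEOUT)");

        /*
         * Tearing down swapped-out PTEs goes through zap_pte_range()
         * and free_swap_and_cache_nr(), which is the path this patch
         * tries to batch.
         */
        clock_gettime(CLOCK_MONOTONIC, &t0);
        munmap(p, SIZE);
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("munmap of swapped-out region took %ld ms\n",
               (long)((t1.tv_sec - t0.tv_sec) * 1000 +
                      (t1.tv_nsec - t0.tv_nsec) / 1000000));
        return 0;
}

With the batched path, the swap_map updates and the single
swap_entry_range_free() call are done once per contiguous run of
entries instead of once per entry, so most of the per-entry cluster
lock/unlock cycles during munmap() or process exit go away.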

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@xxxxxxxx/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@xxxxxxxxxx/

Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
 mm/swapfile.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..9def6dba8d26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
         return true;
 }
 
+static bool swap_is_last_map(struct swap_info_struct *si,
+                             unsigned long offset, int nr_pages,
+                             bool *any_only_cache)
+{
+        unsigned char *map = si->swap_map + offset;
+        unsigned char *map_end = map + nr_pages;
+        bool cached = false;
+
+        do {
+                if ((*map & ~SWAP_HAS_CACHE) != 1)
+                        return false;
+                if (*map & SWAP_HAS_CACHE)
+                        cached = true;
+        } while (++map < map_end);
+
+        *any_only_cache = cached;
+        return true;
+}
+
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1808,6 +1827,29 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
         if (WARN_ON(end_offset > si->max))
                 goto out;
 
+        if (nr > 1) {
+                struct swap_cluster_info *ci;
+                bool batched_free;
+                int i;
+
+                ci = lock_cluster_or_swap_info(si, start_offset);
+                if ((batched_free = swap_is_last_map(si, start_offset, nr, &any_only_cache))) {
+                        for (i = 0; i < nr; i++)
+                                WRITE_ONCE(si->swap_map[start_offset + i], SWAP_HAS_CACHE);
+                }
+                unlock_cluster_or_swap_info(si, ci);
+
+                if (batched_free) {
+                        spin_lock(&si->lock);
+                        pr_err("%s offset:%lx nr:%d\n", __func__, start_offset, nr);
+                        swap_entry_range_free(si, entry, nr);
+                        spin_unlock(&si->lock);
+                        if (any_only_cache)
+                                goto reclaim;
+                        goto out;
+                }
+        }
+
         /*
          * First free all entries in the range.
          */
@@ -1828,6 +1870,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
         if (!any_only_cache)
                 goto out;
 
+reclaim:
         /*
          * Now go back over the range trying to reclaim the swap cache. This is
          * more efficient for large folios because we will only try to reclaim
--
2.34.1

> ---
>  mm/swapfile.c | 59 ++++++++++++++++++++++++++---------------------------------
>  1 file changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 34e6ea13e8e4..9b63b2262cc2 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -479,20 +479,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>  }
>
>  /*
> - * The cluster ci decreases one usage. If the usage counter becomes 0,
> + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
>   * which means no page in the cluster is in use, we can optionally discard
>   * the cluster and add it to free cluster list.
>   */
> -static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> +static void dec_cluster_info_page(struct swap_info_struct *p,
> +                                  struct swap_cluster_info *ci, int nr_pages)
>  {
>          if (!p->cluster_info)
>                  return;
>
> -        VM_BUG_ON(ci->count == 0);
> +        VM_BUG_ON(ci->count < nr_pages);
>          VM_BUG_ON(cluster_is_free(ci));
>          lockdep_assert_held(&p->lock);
>          lockdep_assert_held(&ci->lock);
> -        ci->count--;
> +        ci->count -= nr_pages;
>
>          if (!ci->count) {
>                  free_cluster(p, ci);
> @@ -998,19 +999,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
>          return n_ret;
>  }
>
> -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> -{
> -        unsigned long offset = idx * SWAPFILE_CLUSTER;
> -        struct swap_cluster_info *ci;
> -
> -        ci = lock_cluster(si, offset);
> -        memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> -        ci->count = 0;
> -        free_cluster(si, ci);
> -        unlock_cluster(ci);
> -        swap_range_free(si, offset, SWAPFILE_CLUSTER);
> -}
> -
>  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  {
>          int order = swap_entry_order(entry_order);
> @@ -1269,21 +1257,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
>          return usage;
>  }
>
> -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> +/*
> + * Drop the last HAS_CACHE flag of swap entries; the caller has to
> + * ensure all entries belong to the same cgroup.
> + */
> +static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
> +                                  unsigned int nr_pages)
>  {
> -        struct swap_cluster_info *ci;
>          unsigned long offset = swp_offset(entry);
> -        unsigned char count;
> +        unsigned char *map = p->swap_map + offset;
> +        unsigned char *map_end = map + nr_pages;
> +        struct swap_cluster_info *ci;
>
>          ci = lock_cluster(p, offset);
> -        count = p->swap_map[offset];
> -        VM_BUG_ON(count != SWAP_HAS_CACHE);
> -        p->swap_map[offset] = 0;
> -        dec_cluster_info_page(p, ci);
> +        do {
> +                VM_BUG_ON(*map != SWAP_HAS_CACHE);
> +                *map = 0;
> +        } while (++map < map_end);
> +        dec_cluster_info_page(p, ci, nr_pages);
>          unlock_cluster(ci);
>
> -        mem_cgroup_uncharge_swap(entry, 1);
> -        swap_range_free(p, offset, 1);
> +        mem_cgroup_uncharge_swap(entry, nr_pages);
> +        swap_range_free(p, offset, nr_pages);
>  }
>
>  static void cluster_swap_free_nr(struct swap_info_struct *sis,
> @@ -1343,7 +1338,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
>  void put_swap_folio(struct folio *folio, swp_entry_t entry)
>  {
>          unsigned long offset = swp_offset(entry);
> -        unsigned long idx = offset / SWAPFILE_CLUSTER;
>          struct swap_cluster_info *ci;
>          struct swap_info_struct *si;
>          unsigned char *map;
> @@ -1356,19 +1350,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
>                  return;
>
>          ci = lock_cluster_or_swap_info(si, offset);
> -        if (size == SWAPFILE_CLUSTER) {
> +        if (size > 1) {
>                  map = si->swap_map + offset;
> -                for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> +                for (i = 0; i < size; i++) {
>                          val = map[i];
>                          VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>                          if (val == SWAP_HAS_CACHE)
>                                  free_entries++;
>                  }
> -                if (free_entries == SWAPFILE_CLUSTER) {
> +                if (free_entries == size) {
>                          unlock_cluster_or_swap_info(si, ci);
>                          spin_lock(&si->lock);
> -                        mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
> -                        swap_free_cluster(si, idx);
> +                        swap_entry_range_free(si, entry, size);
>                          spin_unlock(&si->lock);
>                          return;
>                  }
> @@ -1413,7 +1406,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
>          for (i = 0; i < n; ++i) {
>                  p = swap_info_get_cont(entries[i], prev);
>                  if (p)
> -                        swap_entry_free(p, entries[i]);
> +                        swap_entry_range_free(p, entries[i], 1);
>                  prev = p;
>          }
>          if (p)
>
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>

Thanks
Barry