On 12/31/24 at 01:46am, Kairui Song wrote:

......snip.....

> ---
>  include/linux/swap.h |   3 +-
>  mm/swapfile.c        | 435 ++++++++++++++++++++++++-------------------
>  2 files changed, 246 insertions(+), 192 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 339d7f0192ff..c4ff31cb6bde 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -291,6 +291,7 @@ enum swap_cluster_flags {
>   * throughput.
>   */
>  struct percpu_cluster {
> +        local_lock_t lock; /* Protect the percpu_cluster above */
>          unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>  };
>
> @@ -313,7 +314,7 @@ struct swap_info_struct {
>          /* list of cluster that contains at least one free slot */
>          struct list_head frag_clusters[SWAP_NR_ORDERS];
>          /* list of cluster that are fragmented or contented */
> -        unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
> +        atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
>          unsigned int pages;           /* total of usable pages of swap */
>          atomic_long_t inuse_pages;    /* number of those currently in use */
>          struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 7795a3d27273..dadd4fead689 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -261,12 +261,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
>          folio_ref_sub(folio, nr_pages);
>          folio_set_dirty(folio);
>
> -        spin_lock(&si->lock);
>          /* Only sinple page folio can be backed by zswap */
>          if (nr_pages == 1)
>                  zswap_invalidate(entry);
>          swap_entry_range_free(si, entry, nr_pages);
> -        spin_unlock(&si->lock);
>          ret = nr_pages;
>  out_unlock:
>          folio_unlock(folio);
> @@ -403,7 +401,21 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>
>  static inline bool cluster_is_free(struct swap_cluster_info *info)
>  {
> -        return info->flags == CLUSTER_FLAG_FREE;
> +        return info->count == 0;

This is a little confusing. Maybe we should add a separate helper and call
it cluster_is_empty(), because discarded clusters are also able to pass
this check.
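Something like the below is what I have in mind (completely untested sketch,
only to illustrate the naming; cluster_is_empty() is not part of this patch):

/*
 * No slot in this cluster is in use. Note a cluster that is pending or
 * undergoing discard is also "empty" here, but it is not usable or free
 * until the discard work returns it to the free list.
 */
static inline bool cluster_is_empty(struct swap_cluster_info *info)
{
        return info->count == 0;
}

Then the callers that really only care about "no used slots" (e.g. the
`if (cluster_is_free(ci)) offset = cluster_offset(si, ci);` check in the
fast path below) read correctly, and the name no longer suggests the
cluster is already on the free list.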
> +}
> +
> +static inline bool cluster_is_discard(struct swap_cluster_info *info)
> +{
> +        return info->flags == CLUSTER_FLAG_DISCARD;
> +}
> +
> +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
> +{
> +        if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
> +                return false;
> +        if (!order)
> +                return true;
> +        return cluster_is_free(ci) || order == ci->order;
>  }
>
>  static inline unsigned int cluster_index(struct swap_info_struct *si,
> @@ -440,19 +452,20 @@ static void cluster_move(struct swap_info_struct *si,
>  {
>          VM_WARN_ON(ci->flags == new_flags);
>          BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
> +        lockdep_assert_held(&ci->lock);
>
> -        if (ci->flags == CLUSTER_FLAG_NONE) {
> +        spin_lock(&si->lock);
> +        if (ci->flags == CLUSTER_FLAG_NONE)
>                  list_add_tail(&ci->list, list);
> -        } else {
> -                if (ci->flags == CLUSTER_FLAG_FRAG) {
> -                        VM_WARN_ON(!si->frag_cluster_nr[ci->order]);
> -                        si->frag_cluster_nr[ci->order]--;
> -                }
> +        else
>                  list_move_tail(&ci->list, list);
> -        }
> +        spin_unlock(&si->lock);
> +
> +        if (ci->flags == CLUSTER_FLAG_FRAG)
> +                atomic_long_dec(&si->frag_cluster_nr[ci->order]);
> +        else if (new_flags == CLUSTER_FLAG_FRAG)
> +                atomic_long_inc(&si->frag_cluster_nr[ci->order]);
>          ci->flags = new_flags;
> -        if (new_flags == CLUSTER_FLAG_FRAG)
> -                si->frag_cluster_nr[ci->order]++;
>  }
>
>  /* Add a cluster to discard list and schedule it to do discard */
> @@ -475,39 +488,90 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>
>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -        lockdep_assert_held(&si->lock);
>          lockdep_assert_held(&ci->lock);
>          cluster_move(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
>          ci->order = 0;
>  }
>
> +/*
> + * Isolate and lock the first cluster that is not contented on a list,
> + * clean its flag before taken off-list. Cluster flag must be in sync
> + * with list status, so cluster updaters can always know the cluster
> + * list status without touching si lock.
> + *
> + * Note it's possible that all clusters on a list are contented so
> + * this returns NULL for an non-empty list.
> + */
> +static struct swap_cluster_info *cluster_isolate_lock(
> +                struct swap_info_struct *si, struct list_head *list)
> +{
> +        struct swap_cluster_info *ci, *ret = NULL;
> +
> +        spin_lock(&si->lock);
> +
> +        if (unlikely(!(si->flags & SWP_WRITEOK)))
> +                goto out;
> +
> +        list_for_each_entry(ci, list, list) {
> +                if (!spin_trylock(&ci->lock))
> +                        continue;
> +
> +                /* We may only isolate and clear flags of following lists */
> +                VM_BUG_ON(!ci->flags);
> +                VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
> +                          ci->flags != CLUSTER_FLAG_FULL);
> +
> +                list_del(&ci->list);
> +                ci->flags = CLUSTER_FLAG_NONE;
> +                ret = ci;
> +                break;
> +        }
> +out:
> +        spin_unlock(&si->lock);
> +
> +        return ret;
> +}
> +
>  /*
>   * Doing discard actually. After a cluster discard is finished, the cluster
> - * will be added to free cluster list. caller should hold si->lock.
> -*/
> -static void swap_do_scheduled_discard(struct swap_info_struct *si)
> + * will be added to free cluster list. Discard cluster is a bit special as
> + * they don't participate in allocation or reclaim, so clusters marked as
> + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
> + */
> +static bool swap_do_scheduled_discard(struct swap_info_struct *si)
>  {
>          struct swap_cluster_info *ci;
> +        bool ret = false;
>          unsigned int idx;
>
> +        spin_lock(&si->lock);
>          while (!list_empty(&si->discard_clusters)) {
>                  ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
> +                /*
> +                 * Delete the cluster from list but don't clear its flags until
> +                 * discard is done, so isolation and relocation will skip it.
> +                 */
>                  list_del(&ci->list);

I don't understand the above comment. ci has been taken off the list here,
while allocation needs to isolate from a usable list. Even if we cleared
ci->flags now, how could isolation and relocation touch it? I may be
missing something here.

> -                /* Must clear flag when taking a cluster off-list */
> -                ci->flags = CLUSTER_FLAG_NONE;
>                  idx = cluster_index(si, ci);
>                  spin_unlock(&si->lock);
> -
>                  discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>                                  SWAPFILE_CLUSTER);
>
> -                spin_lock(&si->lock);
>                  spin_lock(&ci->lock);
> -                __free_cluster(si, ci);
> +                /*
> +                 * Discard is done, clear its flags as it's now off-list,
> +                 * then return the cluster to allocation list.
> +                 */
> +                ci->flags = CLUSTER_FLAG_NONE;
>                  memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>                                  0, SWAPFILE_CLUSTER);
> +                __free_cluster(si, ci);
>                  spin_unlock(&ci->lock);
> +                ret = true;
> +                spin_lock(&si->lock);
>          }
> +        spin_unlock(&si->lock);
> +        return ret;
>  }
>
>  static void swap_discard_work(struct work_struct *work)

......snip....

> @@ -791,29 +873,34 @@ static void swap_reclaim_work(struct work_struct *work)
>  static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
>                                                unsigned char usage)
>  {
> -        struct percpu_cluster *cluster;
>          struct swap_cluster_info *ci;
>          unsigned int offset, found = 0;
>
> -new_cluster:
> -        lockdep_assert_held(&si->lock);
> -        cluster = this_cpu_ptr(si->percpu_cluster);
> -        offset = cluster->next[order];
> +        /* Fast path using per CPU cluster */
> +        local_lock(&si->percpu_cluster->lock);
> +        offset = __this_cpu_read(si->percpu_cluster->next[order]);
>          if (offset) {
> -                offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
> +                ci = lock_cluster(si, offset);
> +                /* Cluster could have been used by another order */
> +                if (cluster_is_usable(ci, order)) {
> +                        if (cluster_is_free(ci))
> +                                offset = cluster_offset(si, ci);
> +                        offset = alloc_swap_scan_cluster(si, offset, &found,
> +                                                         order, usage);
> +                } else {
> +                        unlock_cluster(ci);
> +                }
>                  if (found)
>                          goto done;
>          }
>
> -        if (!list_empty(&si->free_clusters)) {
> -                ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> -                offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
> -                /*
> -                 * Either we didn't touch the cluster due to swapoff,
> -                 * or the allocation must success.
> -                 */
> -                VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
> -                goto done;
> +new_cluster:
> +        ci = cluster_isolate_lock(si, &si->free_clusters);
> +        if (ci) {
> +                offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> +                                                 &found, order, usage);
> +                if (found)
> +                        goto done;
>          }
>
>          /* Try reclaim from full clusters if free clusters list is drained */
> @@ -821,49 +908,45 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                  swap_reclaim_full_clusters(si, false);
>
>          if (order < PMD_ORDER) {
> -                unsigned int frags = 0;
> +                unsigned int frags = 0, frags_existing;
>
> -                while (!list_empty(&si->nonfull_clusters[order])) {
> -                        ci = list_first_entry(&si->nonfull_clusters[order],
> -                                              struct swap_cluster_info, list);
> -                        cluster_move(si, ci, &si->frag_clusters[order], CLUSTER_FLAG_FRAG);
> +                while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[order]))) {
>                          offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                           &found, order, usage);
> -                        frags++;
> +                        /*
> +                         * With `fragmenting` set to true, it will surely take
                                   ~~~~~~~~~~~

wondering what 'fragmenting' means here.

> +                         * the cluster off nonfull list
> +                         */
>                          if (found)
>                                  goto done;
> +                        frags++;
>                  }
>
> -                /*
> -                 * Nonfull clusters are moved to frag tail if we reached
> -                 * here, count them too, don't over scan the frag list.
> -                 */
> -                while (frags < si->frag_cluster_nr[order]) {
> -                        ci = list_first_entry(&si->frag_clusters[order],
> -                                              struct swap_cluster_info, list);
> +                frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
> +                while (frags < frags_existing &&
> +                       (ci = cluster_isolate_lock(si, &si->frag_clusters[order]))) {
> +                        atomic_long_dec(&si->frag_cluster_nr[order]);
>                          /*
> -                         * Rotate the frag list to iterate, they were all failing
> -                         * high order allocation or moved here due to per-CPU usage,
> -                         * this help keeping usable cluster ahead.
> +                         * Rotate the frag list to iterate, they were all
> +                         * failing high order allocation or moved here due to
> +                         * per-CPU usage, but they could contain newly released
> +                         * reclaimable (eg. lazy-freed swap cache) slots.
>                          */
> -                        list_move_tail(&ci->list, &si->frag_clusters[order]);
>                          offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                           &found, order, usage);
> -                        frags++;
>                          if (found)
>                                  goto done;
> +                        frags++;
>                  }
>          }
>
> -        if (!list_empty(&si->discard_clusters)) {
> -                /*
> -                 * we don't have free cluster but have some clusters in
> -                 * discarding, do discard now and reclaim them, then
> -                 * reread cluster_next_cpu since we dropped si->lock
> -                 */
> -                swap_do_scheduled_discard(si);
> +        /*
> +         * We don't have free cluster but have some clusters in
> +         * discarding, do discard now and reclaim them, then
> +         * reread cluster_next_cpu since we dropped si->lock
> +         */
> +        if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
>                  goto new_cluster;
> -        }
>
>          if (order)
>                  goto done;

.....
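For my own notes while reviewing, this is how I read the nesting between the
two list-lock paths (condensed by me, not code from the patch):

        /*
         * Isolation side (cluster_isolate_lock()): si->lock outer,
         * ci->lock only via trylock; returns with ci->lock held.
         */
        spin_lock(&si->lock);
        if (spin_trylock(&ci->lock)) {
                list_del(&ci->list);
                ci->flags = CLUSTER_FLAG_NONE;
        }
        spin_unlock(&si->lock);

        /*
         * Relocation side (cluster_move()): ci->lock already held,
         * si->lock nested inside it just for the list manipulation.
         */
        lockdep_assert_held(&ci->lock);
        spin_lock(&si->lock);
        list_move_tail(&ci->list, list);
        spin_unlock(&si->lock);

Since the isolation side only ever trylocks ci->lock, the inverted nesting in
cluster_move() should not be able to deadlock, if I'm reading it right.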