Chris Li <chrisl@xxxxxxxxxx> writes: > Track the nonfull cluster as well as the empty cluster > on lists. Each order has one nonfull cluster list. > > The cluster will remember which order it was used during > new cluster allocation. > > When the cluster has free entry, add to the nonfull[order] > list. When the free cluster list is empty, also allocate > from the nonempty list of that order. > > This improves the mTHP swap allocation success rate. > > There are limitations if the distribution of numbers of > different orders of mTHP changes a lot. e.g. there are a lot > of nonfull cluster assign to order A while later time there > are a lot of order B allocation while very little allocation > in order A. Currently the cluster used by order A will not > reused by order B unless the cluster is 100% empty. > > Signed-off-by: Chris Li <chrisl@xxxxxxxxxx> > --- > include/linux/swap.h | 4 ++++ > mm/swapfile.c | 38 +++++++++++++++++++++++++++++++++++--- > 2 files changed, 39 insertions(+), 3 deletions(-) > > diff --git a/include/linux/swap.h b/include/linux/swap.h > index edafd52d7ac4..6716ef236766 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -254,9 +254,11 @@ struct swap_cluster_info { > */ > u16 count; > u8 flags; > + u8 order; > struct list_head list; > }; > #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ > +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ > > /* > * The first page in the swap file is the swap header, which is always marked > @@ -295,6 +297,8 @@ struct swap_info_struct { > unsigned long *zeromap; /* vmalloc'ed bitmap to track zero pages */ > struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ > struct list_head free_clusters; /* free clusters list */ > + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; > + /* list of cluster that contains at least one free slot */ > unsigned int lowest_bit; /* index of first free in swap_map */ > unsigned int highest_bit; /* index of last free in swap_map */ > unsigned int pages; /* total of usable pages of swap */ > diff --git a/mm/swapfile.c b/mm/swapfile.c > index bceead7f9e3c..dcf09eb549db 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -361,14 +361,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > SWAP_MAP_BAD, SWAPFILE_CLUSTER); > > - list_add_tail(&ci->list, &si->discard_clusters); > + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); > + if (ci->flags & CLUSTER_FLAG_NONFULL) > + list_move_tail(&ci->list, &si->discard_clusters); > + else > + list_add_tail(&ci->list, &si->discard_clusters); > + ci->flags = 0; As Ryan pointed out before, it's better to clear the specific bit instead of assigning 0. This will make the code future-proof. 
> schedule_work(&si->discard_work); > } > > static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) > { > + if (ci->flags & CLUSTER_FLAG_NONFULL) > + list_move_tail(&ci->list, &si->free_clusters); > + else > + list_add_tail(&ci->list, &si->free_clusters); > ci->flags = CLUSTER_FLAG_FREE; > - list_add_tail(&ci->list, &si->free_clusters); > } > > /* > @@ -491,8 +499,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste > VM_BUG_ON(ci->count == 0); > ci->count--; > > - if (!ci->count) > + if (!ci->count) { > free_cluster(p, ci); > + return; > + } > + > + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { > + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); > + ci->flags |= CLUSTER_FLAG_NONFULL; > + } > } > > /* > @@ -553,6 +568,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, > if (tmp == SWAP_NEXT_INVALID) { > if (!list_empty(&si->free_clusters)) { > ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); > + list_del(&ci->list); > + spin_lock(&ci->lock); > + ci->order = order; > + ci->flags = 0; > + spin_unlock(&ci->lock); > + tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; > + } else if (!list_empty(&si->nonfull_clusters[order])) { > + ci = list_first_entry(&si->nonfull_clusters[order], > + struct swap_cluster_info, list); > + list_del(&ci->list); > + spin_lock(&ci->lock); > + ci->flags = 0; > + spin_unlock(&ci->lock); > tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; > } else if (!list_empty(&si->discard_clusters)) { We should check discard_clusters before nonfull clusters. 
> /* > @@ -967,6 +995,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) > ci = lock_cluster(si, offset); > memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); > ci->count = 0; > + ci->order = 0; > ci->flags = 0; > free_cluster(si, ci); > unlock_cluster(ci); > @@ -2922,6 +2951,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, > INIT_LIST_HEAD(&p->free_clusters); > INIT_LIST_HEAD(&p->discard_clusters); > > + for (i = 0; i < SWAP_NR_ORDERS; i++) > + INIT_LIST_HEAD(&p->nonfull_clusters[i]); > + > for (i = 0; i < swap_header->info.nr_badpages; i++) { > unsigned int page_nr = swap_header->info.badpages[i]; > if (page_nr == 0 || page_nr > swap_header->info.last_page) -- Best Regards, Huang, Ying