On Mon, Jul 15, 2024 at 7:57 AM Ryan Roberts <ryan.roberts@xxxxxxx> wrote: > > On 11/07/2024 08:29, Chris Li wrote: > > Previously, the swap cluster used a cluster index as a pointer > > to construct a custom single link list type "swap_cluster_list". > > The next cluster pointer is shared with the cluster->count. > > It prevents puting the non free cluster into a list. > > > > Change the cluster to use the standard double link list instead. > > This allows tracing the nonfull cluster in the follow up patch. > > That way, it is faster to get to the nonfull cluster of that order. > > > > Remove the cluster getter/setter for accessing the cluster > > struct member. > > > > The list operation is protected by the swap_info_struct->lock. > > > > Change cluster code to use "struct swap_cluster_info *" to > > reference the cluster rather than by using index. That is more > > consistent with the list manipulation. It avoids the repeat > > adding index to the cluser_info. The code is easier to understand. > > > > Remove the cluster next pointer is NULL flag, the double link > > list can handle the empty list pretty well. > > > > The "swap_cluster_info" struct is two pointer bigger, because > > 512 swap entries share one swap struct, it has very little impact > > on the average memory usage per swap entry. For 1TB swapfile, the > > swap cluster data structure increases from 8MB to 24MB. > > > > Other than the list conversion, there is no real function change > > in this patch. > > > > Signed-off-by: Chris Li <chrisl@xxxxxxxxxx> > > --- > > include/linux/swap.h | 26 +++--- > > mm/swapfile.c | 225 ++++++++++++++------------------------------------- > > 2 files changed, 70 insertions(+), 181 deletions(-) > > > > diff --git a/include/linux/swap.h b/include/linux/swap.h > > index e473fe6cfb7a..e9be95468fc7 100644 > > --- a/include/linux/swap.h > > +++ b/include/linux/swap.h > > @@ -243,22 +243,21 @@ enum { > > * free clusters are organized into a list. 
We fetch an entry from the list to > > * get a free cluster. > > * > > - * The data field stores next cluster if the cluster is free or cluster usage > > - * counter otherwise. The flags field determines if a cluster is free. This is > > - * protected by swap_info_struct.lock. > > + * The flags field determines if a cluster is free. This is > > + * protected by cluster lock. > > */ > > struct swap_cluster_info { > > spinlock_t lock; /* > > * Protect swap_cluster_info fields > > - * and swap_info_struct->swap_map > > - * elements correspond to the swap > > - * cluster > > + * other than list, and swap_info_struct->swap_map > > + * elements correspond to the swap cluster. > > nit: correspond -> corresponding Done. > > > */ > > - unsigned int data:24; > > - unsigned int flags:8; > > + u16 count; > > Just to make sure I've understood correctly; count can safely be 16 bit (down > from previous 24 bit) because the max it will ever be is the number of swap Yes. The count no longer needs to double as a cluster index. It just needs to hold the number of swap entries in the cluster. > entries in the cluster, and that's currently no bigger than 512, and in future > we wouldn't expect it to ever get bigger than 8192 (number of pages in PMD-order > for arm64 64K base pages). 8192 is represented in 14 bits. We can make the cluster bigger than the PMD order if we want to; we just need to make sure the count can hold the number of swap entries in a cluster. > > > + u8 flags; > > + struct list_head list; > > }; > > #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ > > -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ > > + > > nit: why the blank line? Good catch. Removed. 
> > > > > /* > > * The first page in the swap file is the swap header, which is always marked > > @@ -283,11 +282,6 @@ struct percpu_cluster { > > unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ > > }; > > > > -struct swap_cluster_list { > > - struct swap_cluster_info head; > > - struct swap_cluster_info tail; > > -}; > > - > > /* > > * The in-memory structure used to track swap areas. > > */ > > @@ -301,7 +295,7 @@ struct swap_info_struct { > > unsigned char *swap_map; /* vmalloc'ed array of usage counts */ > > unsigned long *zeromap; /* vmalloc'ed bitmap to track zero pages */ > > struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ > > - struct swap_cluster_list free_clusters; /* free clusters list */ > > + struct list_head free_clusters; /* free clusters list */ > > unsigned int lowest_bit; /* index of first free in swap_map */ > > unsigned int highest_bit; /* index of last free in swap_map */ > > unsigned int pages; /* total of usable pages of swap */ > > @@ -332,7 +326,7 @@ struct swap_info_struct { > > * list. > > */ > > struct work_struct discard_work; /* discard worker */ > > - struct swap_cluster_list discard_clusters; /* discard clusters list */ > > + struct list_head discard_clusters; /* discard clusters list */ > > struct plist_node avail_lists[]; /* > > * entries in swap_avail_heads, one > > * entry per node. 
> > diff --git a/mm/swapfile.c b/mm/swapfile.c > > index f7224bc1320c..f70d25005d2c 100644 > > --- a/mm/swapfile.c > > +++ b/mm/swapfile.c > > @@ -290,62 +290,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, > > #endif > > #define LATENCY_LIMIT 256 > > > > -static inline void cluster_set_flag(struct swap_cluster_info *info, > > - unsigned int flag) > > -{ > > - info->flags = flag; > > -} > > - > > -static inline unsigned int cluster_count(struct swap_cluster_info *info) > > -{ > > - return info->data; > > -} > > - > > -static inline void cluster_set_count(struct swap_cluster_info *info, > > - unsigned int c) > > -{ > > - info->data = c; > > -} > > - > > -static inline void cluster_set_count_flag(struct swap_cluster_info *info, > > - unsigned int c, unsigned int f) > > -{ > > - info->flags = f; > > - info->data = c; > > -} > > - > > -static inline unsigned int cluster_next(struct swap_cluster_info *info) > > -{ > > - return info->data; > > -} > > - > > -static inline void cluster_set_next(struct swap_cluster_info *info, > > - unsigned int n) > > -{ > > - info->data = n; > > -} > > - > > -static inline void cluster_set_next_flag(struct swap_cluster_info *info, > > - unsigned int n, unsigned int f) > > -{ > > - info->flags = f; > > - info->data = n; > > -} > > - > > static inline bool cluster_is_free(struct swap_cluster_info *info) > > { > > return info->flags & CLUSTER_FLAG_FREE; > > } > > > > -static inline bool cluster_is_null(struct swap_cluster_info *info) > > -{ > > - return info->flags & CLUSTER_FLAG_NEXT_NULL; > > -} > > - > > -static inline void cluster_set_null(struct swap_cluster_info *info) > > +static inline unsigned int cluster_index(struct swap_info_struct *si, > > + struct swap_cluster_info *ci) > > { > > - info->flags = CLUSTER_FLAG_NEXT_NULL; > > - info->data = 0; > > + return ci - si->cluster_info; > > } > > > > static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, > > @@ -394,65 +347,11 @@ static 
inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, > > spin_unlock(&si->lock); > > } > > > > -static inline bool cluster_list_empty(struct swap_cluster_list *list) > > -{ > > - return cluster_is_null(&list->head); > > -} > > - > > -static inline unsigned int cluster_list_first(struct swap_cluster_list *list) > > -{ > > - return cluster_next(&list->head); > > -} > > - > > -static void cluster_list_init(struct swap_cluster_list *list) > > -{ > > - cluster_set_null(&list->head); > > - cluster_set_null(&list->tail); > > -} > > - > > -static void cluster_list_add_tail(struct swap_cluster_list *list, > > - struct swap_cluster_info *ci, > > - unsigned int idx) > > -{ > > - if (cluster_list_empty(list)) { > > - cluster_set_next_flag(&list->head, idx, 0); > > - cluster_set_next_flag(&list->tail, idx, 0); > > - } else { > > - struct swap_cluster_info *ci_tail; > > - unsigned int tail = cluster_next(&list->tail); > > - > > - /* > > - * Nested cluster lock, but both cluster locks are > > - * only acquired when we held swap_info_struct->lock > > - */ > > - ci_tail = ci + tail; > > - spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); > > Just to confirm my understanding, there was never previously any list > manipulation where the si->lock wasn't held, so this ensuring that both clusters > were locked for list manipulation was unnecessary? I don't see any extra places > where you are taking si->lock, so I guess that must be the case (or your new > regime is unsafe...). Yes, and some of my later patches add assertions that si->lock is held on those code paths. The reason for the change is that adding a cluster to a list modifies more than just the current cluster; it also modifies the clusters before and after it. So cluster->lock can no longer be what protects the cluster->list field. The list actually depends on si->lock for protection. 
> > > - cluster_set_next(ci_tail, idx); > > - spin_unlock(&ci_tail->lock); > > - cluster_set_next_flag(&list->tail, idx, 0); > > - } > > -} > > - > > -static unsigned int cluster_list_del_first(struct swap_cluster_list *list, > > - struct swap_cluster_info *ci) > > -{ > > - unsigned int idx; > > - > > - idx = cluster_next(&list->head); > > - if (cluster_next(&list->tail) == idx) { > > - cluster_set_null(&list->head); > > - cluster_set_null(&list->tail); > > - } else > > - cluster_set_next_flag(&list->head, > > - cluster_next(&ci[idx]), 0); > > - > > - return idx; > > -} > > - > > /* Add a cluster to discard list and schedule it to do discard */ > > static void swap_cluster_schedule_discard(struct swap_info_struct *si, > > - unsigned int idx) > > + struct swap_cluster_info *ci) > > { > > + unsigned int idx = cluster_index(si, ci); > > /* > > * If scan_swap_map_slots() can't find a free cluster, it will check > > * si->swap_map directly. To make sure the discarding cluster isn't > > @@ -462,17 +361,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, > > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > > SWAP_MAP_BAD, SWAPFILE_CLUSTER); > > > > - cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); > > - > > + list_add_tail(&ci->list, &si->discard_clusters); > > schedule_work(&si->discard_work); > > } > > > > -static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > > +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) > > { > > - struct swap_cluster_info *ci = si->cluster_info; > > - > > - cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); > > - cluster_list_add_tail(&si->free_clusters, ci, idx); > > + ci->flags = CLUSTER_FLAG_FREE; > > + list_add_tail(&ci->list, &si->free_clusters); > > } > > > > /* > > @@ -481,24 +377,25 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) > > */ > > static void swap_do_scheduled_discard(struct swap_info_struct *si) > 
> { > > - struct swap_cluster_info *info, *ci; > > + struct swap_cluster_info *ci; > > unsigned int idx; > > > > - info = si->cluster_info; > > - > > - while (!cluster_list_empty(&si->discard_clusters)) { > > - idx = cluster_list_del_first(&si->discard_clusters, info); > > + while (!list_empty(&si->discard_clusters)) { > > + ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); > > + list_del(&ci->list); > > + idx = cluster_index(si, ci); > > spin_unlock(&si->lock); > > > > discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, > > SWAPFILE_CLUSTER); > > > > spin_lock(&si->lock); > > - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); > > - __free_cluster(si, idx); > > + > > + spin_lock(&ci->lock); > > + __free_cluster(si, ci); > > memset(si->swap_map + idx * SWAPFILE_CLUSTER, > > 0, SWAPFILE_CLUSTER); > > - unlock_cluster(ci); > > + spin_unlock(&ci->lock); > > } > > } > > > > @@ -521,20 +418,20 @@ static void swap_users_ref_free(struct percpu_ref *ref) > > complete(&si->comp); > > } > > > > -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) > > +static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) > > { > > - struct swap_cluster_info *ci = si->cluster_info; > > + struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); > > nit: long line; consider separating the variable declaration and assignment? Done. 
> > > > > - VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); > > - cluster_list_del_first(&si->free_clusters, ci); > > - cluster_set_count_flag(ci + idx, 0, 0); > > + VM_BUG_ON(cluster_index(si, ci) != idx); > > + list_del(&ci->list); > > + ci->count = 0; > > + ci->flags = 0; > > + return ci; > > } > > > > -static void free_cluster(struct swap_info_struct *si, unsigned long idx) > > +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) > > { > > - struct swap_cluster_info *ci = si->cluster_info + idx; > > - > > - VM_BUG_ON(cluster_count(ci) != 0); > > + VM_BUG_ON(ci->count != 0); > > /* > > * If the swap is discardable, prepare discard the cluster > > * instead of free it immediately. The cluster will be freed > > @@ -542,11 +439,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) > > */ > > if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == > > (SWP_WRITEOK | SWP_PAGE_DISCARD)) { > > - swap_cluster_schedule_discard(si, idx); > > + swap_cluster_schedule_discard(si, ci); > > return; > > } > > > > - __free_cluster(si, idx); > > + __free_cluster(si, ci); > > } > > > > /* > > @@ -559,15 +456,15 @@ static void add_cluster_info_page(struct swap_info_struct *p, > > unsigned long count) > > { > > unsigned long idx = page_nr / SWAPFILE_CLUSTER; > > + struct swap_cluster_info *ci = cluster_info + idx; > > > > if (!cluster_info) > > return; > > - if (cluster_is_free(&cluster_info[idx])) > > + if (cluster_is_free(ci)) > > alloc_cluster(p, idx); > > > > - VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); > > - cluster_set_count(&cluster_info[idx], > > - cluster_count(&cluster_info[idx]) + count); > > + VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); > > + ci->count += count; > > } > > > > /* > > @@ -581,24 +478,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p, > > } > > > > /* > > - * The cluster corresponding to page_nr decreases one usage. 
If the usage > > - * counter becomes 0, which means no page in the cluster is in using, we can > > - * optionally discard the cluster and add it to free cluster list. > > + * The cluster ci decreases one usage. If the usage counter becomes 0, > > + * which means no page in the cluster is in using, we can optionally discard > > nit: "in using" -> "in use" Done. BTW, the "in using" is from the original code before my patch :-) > > > + * the cluster and add it to free cluster list. > > */ > > -static void dec_cluster_info_page(struct swap_info_struct *p, > > - struct swap_cluster_info *cluster_info, unsigned long page_nr) > > +static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) > > { > > - unsigned long idx = page_nr / SWAPFILE_CLUSTER; > > - > > - if (!cluster_info) > > + if (!p->cluster_info) > > return; > > > > - VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); > > - cluster_set_count(&cluster_info[idx], > > - cluster_count(&cluster_info[idx]) - 1); > > + VM_BUG_ON(ci->count == 0); > > + ci->count--; > > > > - if (cluster_count(&cluster_info[idx]) == 0) > > - free_cluster(p, idx); > > + if (!ci->count) > > + free_cluster(p, ci); > > } > > > > /* > > @@ -611,10 +504,10 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, > > { > > struct percpu_cluster *percpu_cluster; > > bool conflict; > > - > > + struct swap_cluster_info *first = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); > > nit: long line; consider splitting variable declaration and assignment. Done. > > nit: You've removed the blank line between variable declarations and statements; > did you run checkpatch.pl? Not yet. I will make sure I run it on the next revision. Thanks for the reminder. Chris