Hi Kairui,

On 12/31/24 at 01:46am, Kairui Song wrote:
......snip...
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 7963a0c646a4..e6e58cfb5178 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
>  	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
>  }

I am reading the swap code, and while at it I am going through this
patchset too. I have some nitpicks; please see the inline comments
below.

> 
> +/*
> + * Use the second highest bit of inuse_pages counter as the indicator
> + * of if one swap device is on the available plist, so the atomic can
       ~~ redundant?
> + * still be updated arithmetic while having special data embedded.
                       ~~~~~~~~~~ typo, arithmetically?
> + *
> + * inuse_pages counter is the only thing indicating if a device should
> + * be on avail_lists or not (except swapon / swapoff). By embedding the
> + * on-list bit in the atomic counter, updates no longer need any lock
       ~~~ off-list?
> + * to check the list status.
> + *
> + * This bit will be set if the device is not on the plist and not
> + * usable, will be cleared if the device is on the plist.
> + */
> +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
> +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
> +static long swap_usage_in_pages(struct swap_info_struct *si)
> +{
> +	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
> +}
> +
>  /* Reclaim the swap entry anyway if possible */
>  #define TTRS_ANYWAY		0x1
>  /*
> @@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
>  	int nr_reclaim;
>  
>  	if (force)
> -		to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
> +		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
>  
>  	while (!list_empty(&si->full_clusters)) {
>  		ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
> @@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>  	return found;
>  }
>  
> -static void __del_from_avail_list(struct swap_info_struct *si)
> +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */

This seems to say just the opposite: the off-list bit is *set* in this
function, not cleared.

> +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
>  {
>  	int nid;
> +	unsigned long pages;
> +
> +	spin_lock(&swap_avail_lock);
> +
> +	if (swapoff) {
> +		/*
> +		 * Forcefully remove it. Clear the SWP_WRITEOK flags for
> +		 * swapoff here so it's synchronized by both si->lock and
> +		 * swap_avail_lock, to ensure the result can be seen by
> +		 * add_to_avail_list.
> +		 */
> +		lockdep_assert_held(&si->lock);
> +		si->flags &= ~SWP_WRITEOK;
> +		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> +	} else {
> +		/*
> +		 * If not called by swapoff, take it off-list only if it's
> +		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
> +		 * si->inuse_pages == pages), any concurrent slot freeing,
> +		 * or device already removed from plist by someone else
> +		 * will make this return false.
> +		 */
> +		pages = si->pages;
> +		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> +					     pages | SWAP_USAGE_OFFLIST_BIT))
> +			goto skip;
> +	}
> 
> -	assert_spin_locked(&si->lock);
>  	for_each_node(nid)
>  		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> +
> +skip:
> +	spin_unlock(&swap_avail_lock);
>  }
> 
> -static void del_from_avail_list(struct swap_info_struct *si)
> +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */

Ditto: the bit is *cleared* in this function, so the two comments look
swapped.
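If so, maybe just swap the two sentences, e.g. (only a suggestion):

/* SWAP_USAGE_OFFLIST_BIT can only be set by del_from_avail_list(). */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
...
/* SWAP_USAGE_OFFLIST_BIT can only be cleared by add_to_avail_list(). */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)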
> +static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
>  {
> +	int nid;
> +	long val;
> +	unsigned long pages;
> +
>  	spin_lock(&swap_avail_lock);
> -	__del_from_avail_list(si);
> +
> +	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
> +	if (swapon) {
> +		lockdep_assert_held(&si->lock);
> +		si->flags |= SWP_WRITEOK;
> +	} else {
> +		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
> +			goto skip;
> +	}
> +
> +	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
> +		goto skip;
> +
> +	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> +
> +	/*
> +	 * When device is full and device is on the plist, only one updater will
> +	 * see (inuse_pages == si->pages) and will call del_from_avail_list. If
> +	 * that updater happen to be here, just skip adding.
> +	 */
> +	pages = si->pages;
> +	if (val == pages) {
> +		/* Just like the cmpxchg in del_from_avail_list */
> +		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> +					    pages | SWAP_USAGE_OFFLIST_BIT))
> +			goto skip;
> +	}
> +
> +	for_each_node(nid)
> +		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> +
> +skip:
>  	spin_unlock(&swap_avail_lock);
>  }
> 
> -static void swap_range_alloc(struct swap_info_struct *si,
> -			     unsigned int nr_entries)
> +/*
> + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock

Not sure if swap_inuse_add()/swap_inuse_sub() or
swap_inuse_cnt_add()/swap_inuse_cnt_sub() would be better names, since
"usage" here can be confused with the usage counts kept in
si->swap_map[offset]. Anyway, no strong opinion.

> + * within each cluster, so the total contribution to the global counter should
> + * always be positive and cannot exceed the total number of usable slots.
> + */
> +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
>  {
> -	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
> -	if (si->inuse_pages == si->pages) {
> -		del_from_avail_list(si);
> +	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
> 
> -		if (si->cluster_info && vm_swap_full())
> -			schedule_work(&si->reclaim_work);
> +	/*
> +	 * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
> +	 * remove it from the plist.
> +	 */
> +	if (unlikely(val == si->pages)) {
> +		del_from_avail_list(si, false);
> +		return true;
>  	}
> +
> +	return false;
>  }
> 
> -static void add_to_avail_list(struct swap_info_struct *si)
> +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
>  {
> -	int nid;
> +	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
> 
> -	spin_lock(&swap_avail_lock);
> -	for_each_node(nid)
> -		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> -	spin_unlock(&swap_avail_lock);
> +	/*
> +	 * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
> +	 * remove it from the plist.
> +	 */
> +	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
> +		add_to_avail_list(si, false);
> +}
> +
> +static void swap_range_alloc(struct swap_info_struct *si,
> +			     unsigned int nr_entries)
> +{
> +	if (swap_usage_add(si, nr_entries)) {
> +		if (si->cluster_info && vm_swap_full())

We may not need to check si->cluster_info here since it always exists
now.
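Maybe this can then be simplified to something like the following
(untested, assuming si->cluster_info is indeed always allocated for any
swapped-on device now):

static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries)
{
	/* swap_usage_add() returns true if the device just became full */
	if (swap_usage_add(si, nr_entries) && vm_swap_full())
		schedule_work(&si->reclaim_work);
}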
> +			schedule_work(&si->reclaim_work);
> +	}
>  }
>  
>  static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> @@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
>  	for (i = 0; i < nr_entries; i++)
>  		clear_bit(offset + i, si->zeromap);
>  
> -	if (si->inuse_pages == si->pages)
> -		add_to_avail_list(si);
>  	if (si->flags & SWP_BLKDEV)
>  		swap_slot_free_notify =
>  			si->bdev->bd_disk->fops->swap_slot_free_notify;
> @@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
>  	 */
>  	smp_wmb();
>  	atomic_long_add(nr_entries, &nr_swap_pages);
> -	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> +	swap_usage_sub(si, nr_entries);
>  }
>  
>  static int cluster_alloc_swap(struct swap_info_struct *si,
> @@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
>  		spin_unlock(&swap_avail_lock);
>  		spin_lock(&si->lock);
> -		if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
> -			spin_lock(&swap_avail_lock);
> -			if (plist_node_empty(&si->avail_lists[node])) {
> -				spin_unlock(&si->lock);
> -				goto nextsi;
> -			}
> -			WARN(!(si->flags & SWP_WRITEOK),
> -			     "swap_info %d in list but !SWP_WRITEOK\n",
> -			     si->type);
> -			__del_from_avail_list(si);
> -			spin_unlock(&si->lock);
> -			goto nextsi;
> -		}
>  		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
>  					    n_goal, swp_entries, order);
>  		spin_unlock(&si->lock);
> @@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  		cond_resched();
>  
>  		spin_lock(&swap_avail_lock);
> -nextsi:
>  		/*
>  		 * if we got here, it's likely that si was almost full before,
>  		 * and since scan_swap_map_slots() can drop the si->lock,
> @@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
>  		if (sis->flags & SWP_WRITEOK) {
>  			n = sis->pages;
>  			if (free)
> -				n -= sis->inuse_pages;
> +				n -= swap_usage_in_pages(sis);
>  		}
>  		spin_unlock(&sis->lock);
>  	}
> @@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
>  	swp_entry_t entry;
>  	unsigned int i;
>  
> -	if (!READ_ONCE(si->inuse_pages))
> +	if (!swap_usage_in_pages(si))
>  		goto success;
>  
>  retry:
> @@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
>  
>  	spin_lock(&mmlist_lock);
>  	p = &init_mm.mmlist;
> -	while (READ_ONCE(si->inuse_pages) &&
> +	while (swap_usage_in_pages(si) &&
>  	       !signal_pending(current) &&
>  	       (p = p->next) != &init_mm.mmlist) {
>  
> @@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
>  		mmput(prev_mm);
>  
>  	i = 0;
> -	while (READ_ONCE(si->inuse_pages) &&
> +	while (swap_usage_in_pages(si) &&
>  	       !signal_pending(current) &&
>  	       (i = find_next_to_unuse(si, i)) != 0) {
>  
> @@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
>  	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
>  	 * and robust (though cpu-intensive) just to keep retrying.
>  	 */
> -	if (READ_ONCE(si->inuse_pages)) {
> +	if (swap_usage_in_pages(si)) {
>  		if (!signal_pending(current))
>  			goto retry;
>  		return -EINTR;
> @@ -2209,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
>  success:
>  	/*
>  	 * Make sure that further cleanups after try_to_unuse() returns happen
> -	 * after swap_range_free() reduces si->inuse_pages to 0.
> +	 * after swap_range_free() reduces inuse_pages to 0.

Here I personally think the original "si->inuse_pages" reads better.
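i.e. keep the field reference spelled out:

	 * after swap_range_free() reduces si->inuse_pages to 0.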
>  	 */
>  	smp_mb();
>  	return 0;
> @@ -2227,7 +2317,7 @@ static void drain_mmlist(void)
>  	unsigned int type;
>  
>  	for (type = 0; type < nr_swapfiles; type++)
> -		if (swap_info[type]->inuse_pages)
> +		if (swap_usage_in_pages(swap_info[type]))
>  			return;
>  	spin_lock(&mmlist_lock);
>  	list_for_each_safe(p, next, &init_mm.mmlist)
> @@ -2406,7 +2496,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
>  
>  static void _enable_swap_info(struct swap_info_struct *si)
>  {
> -	si->flags |= SWP_WRITEOK;
>  	atomic_long_add(si->pages, &nr_swap_pages);
>  	total_swap_pages += si->pages;
>  
> @@ -2423,9 +2512,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
>  	 */
>  	plist_add(&si->list, &swap_active_head);
>  
> -	/* add to available list if swap device is not full */
> -	if (si->inuse_pages < si->pages)
> -		add_to_avail_list(si);
> +	/* Add back to available list */
> +	add_to_avail_list(si, true);
>  }
>  
>  static void enable_swap_info(struct swap_info_struct *si, int prio,
> @@ -2523,7 +2611,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  		goto out_dput;
>  	}
>  	spin_lock(&p->lock);
> -	del_from_avail_list(p);
> +	del_from_avail_list(p, true);
>  	if (p->prio < 0) {
>  		struct swap_info_struct *si = p;
>  		int nid;
> @@ -2541,7 +2629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	plist_del(&p->list, &swap_active_head);
>  	atomic_long_sub(p->pages, &nr_swap_pages);
>  	total_swap_pages -= p->pages;
> -	p->flags &= ~SWP_WRITEOK;
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
>  
> @@ -2721,7 +2808,7 @@ static int swap_show(struct seq_file *swap, void *v)
>  	}
>  
>  	bytes = K(si->pages);
> -	inuse = K(READ_ONCE(si->inuse_pages));
> +	inuse = K(swap_usage_in_pages(si));
>  
>  	file = si->swap_file;
>  	len = seq_file_path(swap, file, " \t\n\\");
> @@ -2838,6 +2925,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>  	}
>  	spin_lock_init(&p->lock);
>  	spin_lock_init(&p->cont_lock);
> +	atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
>  	init_completion(&p->comp);
>  
>  	return p;
> @@ -3335,7 +3423,7 @@ void si_swapinfo(struct sysinfo *val)
>  		struct swap_info_struct *si = swap_info[type];
>  
>  		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> -			nr_to_be_unused += READ_ONCE(si->inuse_pages);
> +			nr_to_be_unused += swap_usage_in_pages(si);
>  	}
>  	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
>  	val->totalswap = total_swap_pages + nr_to_be_unused;
> -- 
> 2.47.1
> 
> 