On Thu, Jan 2, 2025 at 4:59 PM Baoquan He <bhe@xxxxxxxxxx> wrote:
>
> Hi Kairui,
>
> On 12/31/24 at 01:46am, Kairui Song wrote:
> ......snip...
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 7963a0c646a4..e6e58cfb5178 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
> >  	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
> >  }
>
> I am reading swap code, while at it, I am going through this patchset
> too. Have some nitpick, please see below inline comments. Thanks!
> >
> > +/*
> > + * Use the second highest bit of inuse_pages counter as the indicator
> > + * of if one swap device is on the available plist, so the atomic can
>        ~~ redundant?
> > + * still be updated arithmetic while having special data embedded.
>                        ~~~~~~~~~~ typo, arithmetically?
> > + *
> > + * inuse_pages counter is the only thing indicating if a device should
> > + * be on avail_lists or not (except swapon / swapoff). By embedding the
> > + * on-list bit in the atomic counter, updates no longer need any lock
>        ~~~ off-list?

Ah, right, some typos, will fix these.

> > + * to check the list status.
> > + *
> > + * This bit will be set if the device is not on the plist and not
> > + * usable, will be cleared if the device is on the plist.
> > + */
> > +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
> > +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
> > +static long swap_usage_in_pages(struct swap_info_struct *si)
> > +{
> > +	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
> > +}
> > +
> >  /* Reclaim the swap entry anyway if possible */
> >  #define TTRS_ANYWAY		0x1
> >  /*
> > @@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
> >  	int nr_reclaim;
> >
> >  	if (force)
> > -		to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
> > +		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
> >
> >  	while (!list_empty(&si->full_clusters)) {
> >  		ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
> > @@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> >  	return found;
> >  }
> >
> > -static void __del_from_avail_list(struct swap_info_struct *si)
> > +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
> Seems it just says the opposite. The off-list bit is set in
> this function.

Right, the comments are opposite... will fix them.

> > +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> >  {
> >  	int nid;
> > +	unsigned long pages;
> > +
> > +	spin_lock(&swap_avail_lock);
> > +
> > +	if (swapoff) {
> > +		/*
> > +		 * Forcefully remove it. Clear the SWP_WRITEOK flags for
> > +		 * swapoff here so it's synchronized by both si->lock and
> > +		 * swap_avail_lock, to ensure the result can be seen by
> > +		 * add_to_avail_list.
> > +		 */
> > +		lockdep_assert_held(&si->lock);
> > +		si->flags &= ~SWP_WRITEOK;
> > +		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> > +	} else {
> > +		/*
> > +		 * If not called by swapoff, take it off-list only if it's
> > +		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
> > +		 * si->inuse_pages == pages), any concurrent slot freeing,
> > +		 * or device already removed from plist by someone else
> > +		 * will make this return false.
> > +		 */
> > +		pages = si->pages;
> > +		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> > +					     pages | SWAP_USAGE_OFFLIST_BIT))
> > +			goto skip;
> > +	}
> >
> > -	assert_spin_locked(&si->lock);
> >  	for_each_node(nid)
> >  		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > +
> > +skip:
> > +	spin_unlock(&swap_avail_lock);
> >  }
> >
> > -static void del_from_avail_list(struct swap_info_struct *si)
> > +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
>
> Ditto.
>
> > +static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> >  {
> > +	int nid;
> > +	long val;
> > +	unsigned long pages;
> > +
> >  	spin_lock(&swap_avail_lock);
> > -	__del_from_avail_list(si);
> > +
> > +	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
> > +	if (swapon) {
> > +		lockdep_assert_held(&si->lock);
> > +		si->flags |= SWP_WRITEOK;
> > +	} else {
> > +		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
> > +			goto skip;
> > +	}
> > +
> > +	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
> > +		goto skip;
> > +
> > +	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> > +
> > +	/*
> > +	 * When device is full and device is on the plist, only one updater will
> > +	 * see (inuse_pages == si->pages) and will call del_from_avail_list. If
> > +	 * that updater happen to be here, just skip adding.
> > +	 */
> > +	pages = si->pages;
> > +	if (val == pages) {
> > +		/* Just like the cmpxchg in del_from_avail_list */
> > +		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> > +					    pages | SWAP_USAGE_OFFLIST_BIT))
> > +			goto skip;
> > +	}
> > +
> > +	for_each_node(nid)
> > +		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > +
> > +skip:
> >  	spin_unlock(&swap_avail_lock);
> >  }
> >
> > -static void swap_range_alloc(struct swap_info_struct *si,
> > -			     unsigned int nr_entries)
> > +/*
> > + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
>
> Not sure if swap_inuse_add()/swap_inuse_sub() or swap_inuse_cnt_add/sub()
> is better, because it mixes with the usage of si->swap_map[offset].
> Anyway, not strong opinion.
>
> > + * within each cluster, so the total contribution to the global counter should
> > + * always be positive and cannot exceed the total number of usable slots.
> > + */
> > +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
> >  {
> > -	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
> > -	if (si->inuse_pages == si->pages) {
> > -		del_from_avail_list(si);
> > +	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
> >
> > -		if (si->cluster_info && vm_swap_full())
> > -			schedule_work(&si->reclaim_work);
> > +	/*
> > +	 * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
> > +	 * remove it from the plist.
> > +	 */
> > +	if (unlikely(val == si->pages)) {
> > +		del_from_avail_list(si, false);
> > +		return true;
> >  	}
> > +
> > +	return false;
> >  }
> >
> > -static void add_to_avail_list(struct swap_info_struct *si)
> > +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
> >  {
> > -	int nid;
> > +	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
> >
> > -	spin_lock(&swap_avail_lock);
> > -	for_each_node(nid)
> > -		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > -	spin_unlock(&swap_avail_lock);
> > +	/*
> > +	 * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
> > +	 * remove it from the plist.
> > +	 */
> > +	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
> > +		add_to_avail_list(si, false);
> > +}
> > +
> > +static void swap_range_alloc(struct swap_info_struct *si,
> > +			     unsigned int nr_entries)
> > +{
> > +	if (swap_usage_add(si, nr_entries)) {
> > +		if (si->cluster_info && vm_swap_full())
>
> We may not need check si->cluster_info here since it always exists now.

Good catch, it can indeed be dropped as an optimization. A previous patch
in this series was supposed to drop all of these checks; I think I forgot
this one.

> > +			schedule_work(&si->reclaim_work);
> > +	}
> >  }
> >
> >  static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > @@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> >  	for (i = 0; i < nr_entries; i++)
> >  		clear_bit(offset + i, si->zeromap);
> >
> > -	if (si->inuse_pages == si->pages)
> > -		add_to_avail_list(si);
> >  	if (si->flags & SWP_BLKDEV)
> >  		swap_slot_free_notify =
> >  			si->bdev->bd_disk->fops->swap_slot_free_notify;
> > @@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> >  	 */
> >  	smp_wmb();
> >  	atomic_long_add(nr_entries, &nr_swap_pages);
> > -	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> > +	swap_usage_sub(si, nr_entries);
> >  }
> >
> >  static int cluster_alloc_swap(struct swap_info_struct *si,
> > @@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
> >  		spin_unlock(&swap_avail_lock);
> >  		spin_lock(&si->lock);
> > -		if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
> > -			spin_lock(&swap_avail_lock);
> > -			if (plist_node_empty(&si->avail_lists[node])) {
> > -				spin_unlock(&si->lock);
> > -				goto nextsi;
> > -			}
> > -			WARN(!(si->flags & SWP_WRITEOK),
> > -			     "swap_info %d in list but !SWP_WRITEOK\n",
> > -			     si->type);
> > -			__del_from_avail_list(si);
> > -			spin_unlock(&si->lock);
> > -			goto nextsi;
> > -		}
> >  		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> >  					    n_goal, swp_entries, order);
> >  		spin_unlock(&si->lock);
> > @@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> >  		cond_resched();
> >
> >  		spin_lock(&swap_avail_lock);
> > -nextsi:
> >  		/*
> >  		 * if we got here, it's likely that si was almost full before,
> >  		 * and since scan_swap_map_slots() can drop the si->lock,
> > @@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
> >  		if (sis->flags & SWP_WRITEOK) {
> >  			n = sis->pages;
> >  			if (free)
> > -				n -= sis->inuse_pages;
> > +				n -= swap_usage_in_pages(sis);
> >  		}
> >  		spin_unlock(&sis->lock);
> >  	}
> > @@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
> >  	swp_entry_t entry;
> >  	unsigned int i;
> >
> > -	if (!READ_ONCE(si->inuse_pages))
> > +	if (!swap_usage_in_pages(si))
> >  		goto success;
> >
> >  retry:
> > @@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
> >
> >  	spin_lock(&mmlist_lock);
> >  	p = &init_mm.mmlist;
> > -	while (READ_ONCE(si->inuse_pages) &&
> > +	while (swap_usage_in_pages(si) &&
> >  	       !signal_pending(current) &&
> >  	       (p = p->next) != &init_mm.mmlist) {
> >
> > @@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
> >  	mmput(prev_mm);
> >
> >  	i = 0;
> > -	while (READ_ONCE(si->inuse_pages) &&
> > +	while (swap_usage_in_pages(si) &&
> >  	       !signal_pending(current) &&
> >  	       (i = find_next_to_unuse(si, i)) != 0) {
> >
> > @@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
> >  	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
> >  	 * and robust (though cpu-intensive) just to keep retrying.
> >  	 */
> > -	if (READ_ONCE(si->inuse_pages)) {
> > +	if (swap_usage_in_pages(si)) {
> >  		if (!signal_pending(current))
> >  			goto retry;
> >  		return -EINTR;
> > @@ -2209,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
> >  success:
> >  	/*
> >  	 * Make sure that further cleanups after try_to_unuse() returns happen
> > -	 * after swap_range_free() reduces si->inuse_pages to 0.
> > +	 * after swap_range_free() reduces inuse_pages to 0.
>
> Here, I personally think the original si->inuse_pages may be better.

I updated this comment to keep people from misusing si->inuse_pages
directly. Anyway, it's a trivial comment, so I can keep it unchanged.
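
For anyone following along, here is a minimal userspace sketch of the
flag-in-counter idea the patch uses, assuming C11 atomics. It is only an
illustration, not the kernel code: the demo_* names and the PAGES constant
are made up, and the swap_avail_lock, plist manipulation, SWP_WRITEOK and
swapoff handling from the real patch are all omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Analogous to SWAP_USAGE_OFFLIST_BIT / SWAP_USAGE_COUNTER_MASK. */
#define OFFLIST_BIT	(1L << 30)
#define COUNTER_MASK	(~OFFLIST_BIT)
#define PAGES		8L	/* pretend the device has 8 usable slots */

static atomic_long inuse;

static long usage_in_pages(void)
{
	/* Read the counter with the flag bit masked off. */
	return atomic_load(&inuse) & COUNTER_MASK;
}

static bool demo_usage_add(long nr)
{
	long val = atomic_fetch_add(&inuse, nr) + nr;

	/*
	 * Only an updater that sees exactly "full and flag clear" tries to
	 * set the flag; the cmpxchg loses to any concurrent change.
	 */
	if (val == PAGES) {
		long expected = PAGES;

		if (atomic_compare_exchange_strong(&inuse, &expected,
						   PAGES | OFFLIST_BIT))
			return true;	/* caller would take the device off the list */
	}
	return false;
}

static void demo_usage_sub(long nr)
{
	long val = atomic_fetch_sub(&inuse, nr) - nr;

	/* Flag still set after freeing: clear it and put the device back. */
	if (val & OFFLIST_BIT)
		atomic_fetch_and(&inuse, COUNTER_MASK);
}

int main(void)
{
	demo_usage_add(PAGES);				/* device full, flag set */
	printf("in use: %ld\n", usage_in_pages());	/* 8 */
	demo_usage_sub(1);				/* flag cleared again */
	printf("in use: %ld\n", usage_in_pages());	/* 7 */
	return 0;
}

The point being that the common alloc/free paths only do relaxed arithmetic
on the counter, while the cmpxchg (plus, in the kernel, the swap_avail_lock
around the plist updates) is only needed on the rare full / not-full
transitions.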