Re: [PATCH v3 06/13] mm, swap: clean up plist removal and adding

Hi Kairui,

On 12/31/24 at 01:46am, Kairui Song wrote:
......snip...
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 7963a0c646a4..e6e58cfb5178 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
>  	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
>  }

I am reading the swap code, and while at it I am going through this
patchset too. I have some nitpicks; please see the inline comments
below.

>  
> +/*
> + * Use the second highest bit of inuse_pages counter as the indicator
> + * of if one swap device is on the available plist, so the atomic can
      ~~ "of if" reads redundant, maybe "of whether"?
> + * still be updated arithmetic while having special data embedded.
                       ~~~~~~~~~~ typo, arithmetically?
> + *
> + * inuse_pages counter is the only thing indicating if a device should
> + * be on avail_lists or not (except swapon / swapoff). By embedding the
> + * on-list bit in the atomic counter, updates no longer need any lock
      ~~~ off-list?
> + * to check the list status.
> + *
> + * This bit will be set if the device is not on the plist and not
> + * usable, will be cleared if the device is on the plist.
> + */
> +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
> +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
> +static long swap_usage_in_pages(struct swap_info_struct *si)
> +{
> +	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
> +}
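
For other readers following along: if I understand the trick correctly,
masked reads and plain arithmetic updates can coexist because the flag
sits in a bit the counter can never legitimately reach (inuse_pages
never exceeds si->pages). A quick userspace sketch of the pattern, with
made-up names and bit position, untested:

	#include <stdatomic.h>
	#include <stdio.h>

	#define OFFLIST_BIT	(1L << 62)	/* stand-in for the real bit */
	#define COUNTER_MASK	(~OFFLIST_BIT)

	/* start off-list, as alloc_swap_info() does below */
	static atomic_long inuse = OFFLIST_BIT;

	static long usage(void)
	{
		/* readers mask the flag out, like swap_usage_in_pages() */
		return atomic_load(&inuse) & COUNTER_MASK;
	}

	int main(void)
	{
		atomic_fetch_add(&inuse, 3);		/* arithmetic still works */
		printf("%ld\n", usage());		/* 3, flag masked out */
		atomic_fetch_and(&inuse, COUNTER_MASK);	/* clear flag: "on plist" */
		printf("%ld\n", usage());		/* still 3 */
		return 0;
	}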
> +
>  /* Reclaim the swap entry anyway if possible */
>  #define TTRS_ANYWAY		0x1
>  /*
> @@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
>  	int nr_reclaim;
>  
>  	if (force)
> -		to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
> +		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
>  
>  	while (!list_empty(&si->full_clusters)) {
>  		ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
> @@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>  	return found;
>  }
>  
> -static void __del_from_avail_list(struct swap_info_struct *si)
> +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
       This seems to say the opposite: the off-list bit is *set* in
       this function, not cleared.
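       Maybe:

	/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */

       (and the mirrored wording on add_to_avail_list() below).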
> +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
>  {
>  	int nid;
> +	unsigned long pages;
> +
> +	spin_lock(&swap_avail_lock);
> +
> +	if (swapoff) {
> +		/*
> +		 * Forcefully remove it. Clear the SWP_WRITEOK flags for
> +		 * swapoff here so it's synchronized by both si->lock and
> +		 * swap_avail_lock, to ensure the result can be seen by
> +		 * add_to_avail_list.
> +		 */
> +		lockdep_assert_held(&si->lock);
> +		si->flags &= ~SWP_WRITEOK;
> +		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> +	} else {
> +		/*
> +		 * If not called by swapoff, take it off-list only if it's
> +		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
> +		 * si->inuse_pages == pages), any concurrent slot freeing,
> +		 * or device already removed from plist by someone else
> +		 * will make this return false.
> +		 */
> +		pages = si->pages;
> +		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> +					     pages | SWAP_USAGE_OFFLIST_BIT))
> +			goto skip;
> +	}
>  
> -	assert_spin_locked(&si->lock);
>  	for_each_node(nid)
>  		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> +
> +skip:
> +	spin_unlock(&swap_avail_lock);
>  }
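
Not an issue, just a note in case others trip over it like I did: this
cmpxchg is doing double duty. It only succeeds when the counter equals
si->pages exactly, i.e. the device is full *and* the off-list flag is
still clear, so "full" and "still on plist" are tested and the flag is
flipped in one atomic step. In plain C11 terms, reusing the sketch
names from my comment above (untested):

	long expected = pages;	/* flag clear + exactly full */
	if (atomic_compare_exchange_strong(&inuse, &expected,
					   pages | OFFLIST_BIT)) {
		/* we won the race: flag now set, safe to plist_del() */
	}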
>  
> -static void del_from_avail_list(struct swap_info_struct *si)
> +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */

  Ditto.

> +static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
>  {
> +	int nid;
> +	long val;
> +	unsigned long pages;
> +
>  	spin_lock(&swap_avail_lock);
> -	__del_from_avail_list(si);
> +
> +	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
> +	if (swapon) {
> +		lockdep_assert_held(&si->lock);
> +		si->flags |= SWP_WRITEOK;
> +	} else {
> +		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
> +			goto skip;
> +	}
> +
> +	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
> +		goto skip;
> +
> +	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> +
> +	/*
> +	 * When device is full and device is on the plist, only one updater will
> +	 * see (inuse_pages == si->pages) and will call del_from_avail_list. If
> +	 * that updater happen to be here, just skip adding.
> +	 */
> +	pages = si->pages;
> +	if (val == pages) {
> +		/* Just like the cmpxchg in del_from_avail_list */
> +		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> +					    pages | SWAP_USAGE_OFFLIST_BIT))
> +			goto skip;
> +	}
> +
> +	for_each_node(nid)
> +		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> +
> +skip:
>  	spin_unlock(&swap_avail_lock);
>  }
>  
> -static void swap_range_alloc(struct swap_info_struct *si,
> -			     unsigned int nr_entries)
> +/*
> + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock

Not sure if swap_inuse_add()/swap_inuse_sub() or
swap_inuse_cnt_add()/swap_inuse_cnt_sub() would be better names, since
"usage" could be confused with the usage count kept in
si->swap_map[offset]. Anyway, no strong opinion.

> + * within each cluster, so the total contribution to the global counter should
> + * always be positive and cannot exceed the total number of usable slots.
> + */
> +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
>  {
> -	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
> -	if (si->inuse_pages == si->pages) {
> -		del_from_avail_list(si);
> +	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
>  
> -		if (si->cluster_info && vm_swap_full())
> -			schedule_work(&si->reclaim_work);
> +	/*
> +	 * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
> +	 * remove it from the plist.
> +	 */
> +	if (unlikely(val == si->pages)) {
> +		del_from_avail_list(si, false);
> +		return true;
>  	}
> +
> +	return false;
>  }
>  
> -static void add_to_avail_list(struct swap_info_struct *si)
> +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
>  {
> -	int nid;
> +	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
>  
> -	spin_lock(&swap_avail_lock);
> -	for_each_node(nid)
> -		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> -	spin_unlock(&swap_avail_lock);
> +	/*
> +	 * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
> +	 * remove it from the plist.
> +	 */
> +	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
> +		add_to_avail_list(si, false);
> +}
> +
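
If I read these two helpers right, the common paths are now a single
relaxed atomic op with no lock taken; the plist manipulation only
happens on the full <-> not-full edges. Roughly, again with the sketch
names from above (take_off_plist/put_back_on_plist are hypothetical
stand-ins for the two helpers above):

	/* alloc: lock-free unless this allocation just filled the device */
	if (atomic_fetch_add(&inuse, nr) + nr == pages)
		take_off_plist();

	/* free: lock-free unless the device is currently off-list */
	if (atomic_fetch_sub(&inuse, nr) & OFFLIST_BIT)
		put_back_on_plist();

That matches the "updates no longer need any lock to check the list
status" note in the comment at the top, nice.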
> +static void swap_range_alloc(struct swap_info_struct *si,
> +			     unsigned int nr_entries)
> +{
> +	if (swap_usage_add(si, nr_entries)) {
> +		if (si->cluster_info && vm_swap_full())

We may not need to check si->cluster_info here since it always exists
now.
> +			schedule_work(&si->reclaim_work);
> +	}
>  }
>  
>  static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> @@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
>  	for (i = 0; i < nr_entries; i++)
>  		clear_bit(offset + i, si->zeromap);
>  
> -	if (si->inuse_pages == si->pages)
> -		add_to_avail_list(si);
>  	if (si->flags & SWP_BLKDEV)
>  		swap_slot_free_notify =
>  			si->bdev->bd_disk->fops->swap_slot_free_notify;
> @@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
>  	 */
>  	smp_wmb();
>  	atomic_long_add(nr_entries, &nr_swap_pages);
> -	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> +	swap_usage_sub(si, nr_entries);
>  }
>  
>  static int cluster_alloc_swap(struct swap_info_struct *si,
> @@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
>  		spin_unlock(&swap_avail_lock);
>  		spin_lock(&si->lock);
> -		if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
> -			spin_lock(&swap_avail_lock);
> -			if (plist_node_empty(&si->avail_lists[node])) {
> -				spin_unlock(&si->lock);
> -				goto nextsi;
> -			}
> -			WARN(!(si->flags & SWP_WRITEOK),
> -			     "swap_info %d in list but !SWP_WRITEOK\n",
> -			     si->type);
> -			__del_from_avail_list(si);
> -			spin_unlock(&si->lock);
> -			goto nextsi;
> -		}
>  		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
>  					    n_goal, swp_entries, order);
>  		spin_unlock(&si->lock);
> @@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  		cond_resched();
>  
>  		spin_lock(&swap_avail_lock);
> -nextsi:
>  		/*
>  		 * if we got here, it's likely that si was almost full before,
>  		 * and since scan_swap_map_slots() can drop the si->lock,
> @@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
>  		if (sis->flags & SWP_WRITEOK) {
>  			n = sis->pages;
>  			if (free)
> -				n -= sis->inuse_pages;
> +				n -= swap_usage_in_pages(sis);
>  		}
>  		spin_unlock(&sis->lock);
>  	}
> @@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
>  	swp_entry_t entry;
>  	unsigned int i;
>  
> -	if (!READ_ONCE(si->inuse_pages))
> +	if (!swap_usage_in_pages(si))
>  		goto success;
>  
>  retry:
> @@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
>  
>  	spin_lock(&mmlist_lock);
>  	p = &init_mm.mmlist;
> -	while (READ_ONCE(si->inuse_pages) &&
> +	while (swap_usage_in_pages(si) &&
>  	       !signal_pending(current) &&
>  	       (p = p->next) != &init_mm.mmlist) {
>  
> @@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
>  	mmput(prev_mm);
>  
>  	i = 0;
> -	while (READ_ONCE(si->inuse_pages) &&
> +	while (swap_usage_in_pages(si) &&
>  	       !signal_pending(current) &&
>  	       (i = find_next_to_unuse(si, i)) != 0) {
>  
> @@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
>  	 * folio_alloc_swap(), temporarily hiding that swap.  It's easy
>  	 * and robust (though cpu-intensive) just to keep retrying.
>  	 */
> -	if (READ_ONCE(si->inuse_pages)) {
> +	if (swap_usage_in_pages(si)) {
>  		if (!signal_pending(current))
>  			goto retry;
>  		return -EINTR;
> @@ -2209,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
>  success:
>  	/*
>  	 * Make sure that further cleanups after try_to_unuse() returns happen
> -	 * after swap_range_free() reduces si->inuse_pages to 0.
> +	 * after swap_range_free() reduces inuse_pages to 0.

Here, I personally think the original si->inuse_pages may be better.

>  	 */
>  	smp_mb();
>  	return 0;
> @@ -2227,7 +2317,7 @@ static void drain_mmlist(void)
>  	unsigned int type;
>  
>  	for (type = 0; type < nr_swapfiles; type++)
> -		if (swap_info[type]->inuse_pages)
> +		if (swap_usage_in_pages(swap_info[type]))
>  			return;
>  	spin_lock(&mmlist_lock);
>  	list_for_each_safe(p, next, &init_mm.mmlist)
> @@ -2406,7 +2496,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
>  
>  static void _enable_swap_info(struct swap_info_struct *si)
>  {
> -	si->flags |= SWP_WRITEOK;
>  	atomic_long_add(si->pages, &nr_swap_pages);
>  	total_swap_pages += si->pages;
>  
> @@ -2423,9 +2512,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
>  	 */
>  	plist_add(&si->list, &swap_active_head);
>  
> -	/* add to available list if swap device is not full */
> -	if (si->inuse_pages < si->pages)
> -		add_to_avail_list(si);
> +	/* Add back to available list */
> +	add_to_avail_list(si, true);
>  }
>  
>  static void enable_swap_info(struct swap_info_struct *si, int prio,
> @@ -2523,7 +2611,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  		goto out_dput;
>  	}
>  	spin_lock(&p->lock);
> -	del_from_avail_list(p);
> +	del_from_avail_list(p, true);
>  	if (p->prio < 0) {
>  		struct swap_info_struct *si = p;
>  		int nid;
> @@ -2541,7 +2629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	plist_del(&p->list, &swap_active_head);
>  	atomic_long_sub(p->pages, &nr_swap_pages);
>  	total_swap_pages -= p->pages;
> -	p->flags &= ~SWP_WRITEOK;
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
>  
> @@ -2721,7 +2808,7 @@ static int swap_show(struct seq_file *swap, void *v)
>  	}
>  
>  	bytes = K(si->pages);
> -	inuse = K(READ_ONCE(si->inuse_pages));
> +	inuse = K(swap_usage_in_pages(si));
>  
>  	file = si->swap_file;
>  	len = seq_file_path(swap, file, " \t\n\\");
> @@ -2838,6 +2925,7 @@ static struct swap_info_struct *alloc_swap_info(void)
>  	}
>  	spin_lock_init(&p->lock);
>  	spin_lock_init(&p->cont_lock);
> +	atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
>  	init_completion(&p->comp);
>  
>  	return p;
> @@ -3335,7 +3423,7 @@ void si_swapinfo(struct sysinfo *val)
>  		struct swap_info_struct *si = swap_info[type];
>  
>  		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> -			nr_to_be_unused += READ_ONCE(si->inuse_pages);
> +			nr_to_be_unused += swap_usage_in_pages(si);
>  	}
>  	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
>  	val->totalswap = total_swap_pages + nr_to_be_unused;
> -- 
> 2.47.1
> 
> 
