On Fri, Dec 08, 2017 at 01:41:10PM +0800, Huang, Ying wrote:
> Minchan Kim <minchan@xxxxxxxxxx> writes:
> 
> > On Thu, Dec 07, 2017 at 04:29:37PM -0800, Andrew Morton wrote:
> >> On Thu, 7 Dec 2017 09:14:26 +0800 "Huang, Ying" <ying.huang@xxxxxxxxx> wrote:
> >> 
> >> > When swapin is performed, after getting the swap entry information
> >> > from the page table, the PTL (page table lock) is released; the
> >> > system then swaps in the swap entry without holding any lock that
> >> > prevents the swap device from being swapped off.  This may cause a
> >> > race like the one below,
> >> > 
> >> > CPU 1				CPU 2
> >> > -----				-----
> >> > do_swap_page
> >> >   swapin_readahead
> >> >     __read_swap_cache_async
> >> > 				swapoff
> >> >       swapcache_prepare
> >> > 				  p->swap_map = NULL
> >> >         __swap_duplicate
> >> >           p->swap_map[?] /* !!! NULL pointer access */
> >> > 
> >> > Because swapoff is usually done only at system shutdown, the race
> >> > may not hit many people in practice.  But it is still a race that
> >> > needs to be fixed.
> >> 
> >> swapoff is so rare that it's hard to get motivated about any fix which
> >> adds overhead to the regular codepaths.
> > 
> > That was my concern, too, when I saw this patch.
> > 
> >> 
> >> Is there something we can do to ensure that all the overhead of this
> >> fix is placed into the swapoff side?  stop_machine() may be a bit
> >> brutal, but a surprising amount of code uses it.  Any other ideas?
> > 
> > How about this?
> > 
> > I think it's the same approach as the old one, where we used si->lock
> > everywhere instead of the more fine-grained cluster lock.
> > 
> > The reason I repeatedly reset p->max to zero in the loop is to avoid
> > needing a lockdep annotation (spin_lock_nested() or the like) to
> > prevent a false positive.
> > 
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 42fe5653814a..9ce007a42bbc 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -2644,6 +2644,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> >  	swap_file = p->swap_file;
> >  	old_block_size = p->old_block_size;
> >  	p->swap_file = NULL;
> > +
> > +	if (p->flags & SWP_SOLIDSTATE) {
> > +		unsigned long ci, nr_cluster;
> > +
> > +		nr_cluster = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
> > +		for (ci = 0; ci < nr_cluster; ci++) {
> > +			struct swap_cluster_info *sci;
> > +
> > +			sci = lock_cluster(p, ci * SWAPFILE_CLUSTER);
> > +			p->max = 0;
> > +			unlock_cluster(sci);
> > +		}
> > +	}
> >  	p->max = 0;
> >  	swap_map = p->swap_map;
> >  	p->swap_map = NULL;
> > @@ -3369,10 +3382,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> >  		goto bad_file;
> >  	p = swap_info[type];
> >  	offset = swp_offset(entry);
> > -	if (unlikely(offset >= p->max))
> > -		goto out;
> > 
> >  	ci = lock_cluster_or_swap_info(p, offset);
> > +	if (unlikely(offset >= p->max))
> > +		goto unlock_out;
> > 
> >  	count = p->swap_map[offset];
> 
> Sorry, this doesn't work, because
> 
>   lock_cluster_or_swap_info()
> 
> needs to read p->cluster_info, which may be freed during swapoff too.
> 
> To reduce the added overhead in the regular code path, maybe we can use
> SRCU to implement get_swap_device() and put_swap_device()?
> srcu_read_lock/unlock() only increment/decrement a CPU-local variable,
> which should be acceptable in the not-so-hot swap path?
> 
> This needs to select CONFIG_SRCU if CONFIG_SWAP is enabled.  But I
> guess that should be acceptable too?

Why do we need SRCU here?  Is plain RCU, like below, not enough?  The
patch may still have bugs and room for optimization in performance and
naming; I just wanted to show my intention.
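For comparison, here is roughly how I imagine the SRCU variant would
look.  This is an untested sketch: get_swap_device()/put_swap_device()
are named per your suggestion, and the exact validity checks and the
srcu-index plumbing are my assumption.

DEFINE_STATIC_SRCU(swapoff_srcu);

/*
 * Pin the swap device across a swap operation; returns NULL if the
 * entry raced with swapoff.  The read side is just a per-CPU counter
 * update, so the regular code path stays cheap.
 */
static struct swap_info_struct *get_swap_device(swp_entry_t entry, int *idx)
{
	struct swap_info_struct *si;
	unsigned int type = swp_type(entry);

	if (type >= nr_swapfiles)
		return NULL;

	*idx = srcu_read_lock(&swapoff_srcu);
	si = swap_info[type];
	if (!si || !(si->flags & SWP_USED) ||
	    swp_offset(entry) >= si->max) {
		srcu_read_unlock(&swapoff_srcu, *idx);
		return NULL;		/* raced with swapoff */
	}
	return si;			/* stable until put_swap_device(idx) */
}

static void put_swap_device(int idx)
{
	srcu_read_unlock(&swapoff_srcu, idx);
}

swapoff() would then call synchronize_srcu() after clearing p->swap_map
and p->cluster_info but before freeing them, so the whole cost lands on
the swapoff side.  Anyway, the plain-RCU version I have in mind: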
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2417d288e016..bfe493f3bcb8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -273,6 +273,7 @@ struct swap_info_struct {
 					 */
 	struct work_struct discard_work; /* discard worker */
 	struct swap_cluster_list discard_clusters; /* discard clusters list */
+	struct rcu_head rcu;
 };
 
 #ifdef CONFIG_64BIT
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 42fe5653814a..ecec064f9b20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -302,6 +302,7 @@ static inline struct swap_cluster_info *lock_cluster_or_swap_info(
 {
 	struct swap_cluster_info *ci;
 
+	rcu_read_lock();
 	ci = lock_cluster(si, offset);
 	if (!ci)
 		spin_lock(&si->lock);
@@ -316,6 +317,7 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
 		unlock_cluster(ci);
 	else
 		spin_unlock(&si->lock);
+	rcu_read_unlock();
 }
 
 static inline bool cluster_list_empty(struct swap_cluster_list *list)
@@ -2530,11 +2532,17 @@ bool has_usable_swap(void)
 	return ret;
 }
 
+static void swap_cluster_info_free(struct rcu_head *rcu)
+{
+	struct swap_info_struct *si = container_of(rcu, struct swap_info_struct, rcu);
+	kvfree(si->cluster_info);
+	si->cluster_info = NULL;
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
 	unsigned char *swap_map;
-	struct swap_cluster_info *cluster_info;
 	unsigned long *frontswap_map;
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
@@ -2542,6 +2550,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	struct filename *pathname;
 	int err, found = 0;
 	unsigned int old_block_size;
+	unsigned long ci, nr_cluster;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2631,6 +2640,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_lock(&p->lock);
 	drain_mmlist();
+	nr_cluster = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
 
 	/* wait for anyone still in scan_swap_map */
 	p->highest_bit = 0;		/* cuts scans short */
 	while (p->flags >= SWP_SCANNING) {
@@ -2641,14 +2651,33 @@
 		spin_lock(&p->lock);
 	}
 
+	if (p->flags & SWP_SOLIDSTATE) {
+		struct swap_cluster_info *sci, *head_cluster;
+
+		head_cluster = p->cluster_info;
+		spin_lock(&head_cluster->lock);
+		sci = head_cluster + 1;
+		for (ci = 1; ci < nr_cluster; ci++, sci++)
+			spin_lock_nest_lock(&sci->lock, &head_cluster->lock);
+	}
+
 	swap_file = p->swap_file;
 	old_block_size = p->old_block_size;
 	p->swap_file = NULL;
 	p->max = 0;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
-	cluster_info = p->cluster_info;
-	p->cluster_info = NULL;
+
+	if (p->flags & SWP_SOLIDSTATE) {
+		struct swap_cluster_info *sci, *head_cluster;
+
+		head_cluster = p->cluster_info;
+		sci = head_cluster + 1;
+		for (ci = 1; ci < nr_cluster; ci++, sci++)
+			spin_unlock(&sci->lock);
+		spin_unlock(&head_cluster->lock);
+	}
+
 	frontswap_map = frontswap_map_get(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
@@ -2658,7 +2687,8 @@
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
 	vfree(swap_map);
-	kvfree(cluster_info);
+	call_rcu(&p->rcu, swap_cluster_info_free);
+	synchronize_rcu();
 	kvfree(frontswap_map);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(p->type);
@@ -3369,10 +3399,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 		goto bad_file;
 	p = swap_info[type];
 	offset = swp_offset(entry);
-	if (unlikely(offset >= p->max))
-		goto out;
 
 	ci = lock_cluster_or_swap_info(p, offset);
+	if (unlikely(offset >= p->max))
+		goto unlock_out;
 
 	count = p->swap_map[offset];
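For the record, here is how I read the interleaving with this patch
applied; the diagram below is illustrative, in the style of the one in
the original report:

CPU 1				CPU 2
-----				-----
__swap_duplicate
  lock_cluster_or_swap_info
    rcu_read_lock()
    spin_lock(&ci->lock)
				swapoff
				  spin_lock() on every cluster
				  /* blocks until CPU 1 unlocks */
  offset >= p->max /* still old max */
  count = p->swap_map[offset]
  unlock_cluster_or_swap_info
				  p->max = 0
				  p->swap_map = NULL
				  unlock every cluster
				  vfree(swap_map)
				  call_rcu(swap_cluster_info_free)

A swap-in that instead arrives after swapoff has taken the cluster
locks spins in lock_cluster_or_swap_info(), then observes p->max == 0
and bails out before touching p->swap_map; and cluster_info itself is
only kvfree()d after every rcu_read_lock() section that could
dereference it has exited.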