On 12/31/24 at 01:46am, Kairui Song wrote:
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> The flag SWP_SCANNING was used as an indicator of whether a device
> is being scanned for allocation, and prevents swapoff. Combined with
> SWP_WRITEOK, they work as a set of barriers for a clean swapoff:
>
> 1. Swapoff clears SWP_WRITEOK, allocation requests will see
>    ~SWP_WRITEOK and abort as it's serialized by si->lock.
> 2. Swapoff unuses all allocated entries.
> 3. Swapoff waits for SWP_SCANNING flag to be cleared, so ongoing
>    allocations will stop, preventing UAF.
> 4. Now swapoff can free everything safely.
>
> This will make the allocation path have a hard dependency on
> si->lock. Allocation always have to acquire si->lock first for
> setting SWP_SCANNING and checking SWP_WRITEOK.
>
> This commit removes this flag, and just uses the existing per-CPU
> refcount instead to prevent UAF in step 3, which serves well for
> such usage without dependency on si->lock, and scales very well too.
> Just hold a reference during the whole scan and allocation process.
> Swapoff will kill and wait for the counter.
>
> And for preventing any allocation from happening after step 1 so the
> unuse in step 2 can ensure all slots are free, swapoff will acquire
> the ci->lock of each cluster one by one to ensure all allocations
> see ~SWP_WRITEOK and abort.

Changing to use si->users is great, but I am wondering why we need to
acquire each ci->lock now. After step 1 we have cleared SWP_WRITEOK and
taken the si off the swap_avail_heads list. No matter what, we just need
to wait for p->comp's completion and then continue, so why bother looping
to acquire each ci->lock? (There is a rough sketch of the ordering I have
in mind at the end of this mail, below the quoted patch.)

>
> This way these dependences on si->lock are gone. And worth noting we
> can't kill the refcount as the first step for swapoff as the unuse
> process have to acquire the refcount.
>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  include/linux/swap.h |  1 -
>  mm/swapfile.c        | 90 ++++++++++++++++++++++++++++----------------
>  2 files changed, 57 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index e1eeea6307cd..02120f1005d5 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -219,7 +219,6 @@ enum {
>  	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
>  	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
>  	/* add others here before... */
> -	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
>  };
>
>  #define SWAP_CLUSTER_MAX 32UL
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index e6e58cfb5178..99fd0b0d84a2 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -658,6 +658,8 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
>  {
>  	unsigned int nr_pages = 1 << order;
>
> +	lockdep_assert_held(&ci->lock);
> +
>  	if (!(si->flags & SWP_WRITEOK))
>  		return false;
>
> @@ -1059,8 +1061,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
>  {
>  	int n_ret = 0;
>
> -	si->flags += SWP_SCANNING;
> -
>  	while (n_ret < nr) {
>  		unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
>
> @@ -1069,8 +1069,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
>  		slots[n_ret++] = swp_entry(si->type, offset);
>  	}
>
> -	si->flags -= SWP_SCANNING;
> -
>  	return n_ret;
>  }
>
> @@ -1112,6 +1110,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
>  	return cluster_alloc_swap(si, usage, nr, slots, order);
>  }
>
> +static bool get_swap_device_info(struct swap_info_struct *si)
> +{
> +	if (!percpu_ref_tryget_live(&si->users))
> +		return false;
> +	/*
> +	 * Guarantee the si->users are checked before accessing other
> +	 * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
> +	 * up to dated.
> +	 *
> +	 * Paired with the spin_unlock() after setup_swap_info() in
> +	 * enable_swap_info(), and smp_wmb() in swapoff.
> +	 */
> +	smp_rmb();
> +	return true;
> +}
> +
>  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  {
>  	int order = swap_entry_order(entry_order);
> @@ -1139,13 +1153,16 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  			/* requeue si to after same-priority siblings */
>  			plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
>  			spin_unlock(&swap_avail_lock);
> -			spin_lock(&si->lock);
> -			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> -					n_goal, swp_entries, order);
> -			spin_unlock(&si->lock);
> -			if (n_ret || size > 1)
> -				goto check_out;
> -			cond_resched();
> +			if (get_swap_device_info(si)) {
> +				spin_lock(&si->lock);
> +				n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> +						n_goal, swp_entries, order);
> +				spin_unlock(&si->lock);
> +				put_swap_device(si);
> +				if (n_ret || size > 1)
> +					goto check_out;
> +				cond_resched();
> +			}
>
>  			spin_lock(&swap_avail_lock);
>  			/*
> @@ -1296,16 +1313,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
>  	si = swp_swap_info(entry);
>  	if (!si)
>  		goto bad_nofile;
> -	if (!percpu_ref_tryget_live(&si->users))
> +	if (!get_swap_device_info(si))
>  		goto out;
> -	/*
> -	 * Guarantee the si->users are checked before accessing other
> -	 * fields of swap_info_struct.
> -	 *
> -	 * Paired with the spin_unlock() after setup_swap_info() in
> -	 * enable_swap_info().
> -	 */
> -	smp_rmb();
>  	offset = swp_offset(entry);
>  	if (offset >= si->max)
>  		goto put_out;
> @@ -1785,10 +1794,13 @@ swp_entry_t get_swap_page_of_type(int type)
>  		goto fail;
>
>  	/* This is called for allocating swap entry, not cache */
> -	spin_lock(&si->lock);
> -	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
> -		atomic_long_dec(&nr_swap_pages);
> -	spin_unlock(&si->lock);
> +	if (get_swap_device_info(si)) {
> +		spin_lock(&si->lock);
> +		if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
> +			atomic_long_dec(&nr_swap_pages);
> +		spin_unlock(&si->lock);
> +		put_swap_device(si);
> +	}
>  fail:
>  	return entry;
>  }
> @@ -2562,6 +2574,25 @@ bool has_usable_swap(void)
>  	return ret;
>  }
>
> +/*
> + * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
> + * see the updated flags, so there will be no more allocations.
> + */
> +static void wait_for_allocation(struct swap_info_struct *si)
> +{
> +	unsigned long offset;
> +	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
> +	struct swap_cluster_info *ci;
> +
> +	BUG_ON(si->flags & SWP_WRITEOK);
> +
> +	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
> +		ci = lock_cluster(si, offset);
> +		unlock_cluster(ci);
> +		offset += SWAPFILE_CLUSTER;
> +	}
> +}
> +
>  SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  {
>  	struct swap_info_struct *p = NULL;
> @@ -2632,6 +2663,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
>
> +	wait_for_allocation(p);
> +
>  	disable_swap_slots_cache_lock();
>
>  	set_current_oom_origin();
> @@ -2674,15 +2707,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	spin_lock(&p->lock);
>  	drain_mmlist();
>
> -	/* wait for anyone still in scan_swap_map_slots */
> -	while (p->flags >= SWP_SCANNING) {
> -		spin_unlock(&p->lock);
> -		spin_unlock(&swap_lock);
> -		schedule_timeout_uninterruptible(1);
> -		spin_lock(&swap_lock);
> -		spin_lock(&p->lock);
> -	}
> -
>  	swap_file = p->swap_file;
>  	p->swap_file = NULL;
>  	p->max = 0;
> --
> 2.47.1
>
>
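To make the question above concrete, this is roughly the swapoff ordering I
am reading with this patch applied. It is heavily abridged and untested, my
own annotation only: wait_for_allocation() and its placement come from the
hunks quoted above, while the percpu_ref_kill() / synchronize_rcu() /
wait_for_completion(&p->comp) steps are the ones already in today's swapoff
path (outside the quoted hunks), so please correct me if I got any of the
ordering wrong:

	/* SYSCALL_DEFINE1(swapoff, ...), abridged */
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	...
	p->flags &= ~SWP_WRITEOK;	/* step 1: new allocations must fail */
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	wait_for_allocation(p);		/* new in this patch: cycle every ci->lock */
	...
	err = try_to_unuse(p->type);	/* step 2: unuse still takes si->users refs */
	...
	percpu_ref_kill(&p->users);	/* step 3: percpu_ref_tryget_live() fails now */
	synchronize_rcu();
	wait_for_completion(&p->comp);	/* waits out anyone still holding a reference */

With that ordering in mind, the wait_for_allocation() sweep right after
step 1 is the only part I am unsure about.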