Dan Carpenter reports a potential NULL dereference in get_swap_page_of_type: Smatch complains that the NULL checks on "si" aren't consistent. This seems like a real bug because we have not ensured that the type is valid and so "si" can be NULL. Add the missing check for NULL, taking care to use a read barrier to ensure CPU1 observes CPU0's updates in the correct order: CPU0 CPU1 alloc_swap_info() if (type >= nr_swapfiles) swap_info[type] = p /* handle invalid entry */ smp_wmb() smp_rmb() ++nr_swapfiles p = swap_info[type] Without smp_rmb, CPU1 might observe CPU0's write to nr_swapfiles before CPU0's write to swap_info[type] and read NULL from swap_info[type]. Ying Huang noticed that other places don't order these reads properly. Introduce swap_type_to_swap_info to encourage correct usage. Use READ_ONCE and WRITE_ONCE to follow the Linux Kernel Memory Model (see tools/memory-model/Documentation/explanation.txt). This ordering need not be enforced in places where swap_lock is held (e.g. si_swapinfo) because swap_lock serializes updates to nr_swapfiles and the swap_info array. This is a theoretical problem, no actual reports of it exist. Fixes: ec8acf20afb8 ("swap: add per-partition lock for swapfile") Reported-by: Dan Carpenter <dan.carpenter@xxxxxxxxxx> Signed-off-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> Cc: Alan Stern <stern@xxxxxxxxxxxxxxxxxxx> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx> Cc: Andrea Parri <andrea.parri@xxxxxxxxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Dan Carpenter <dan.carpenter@xxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Cc: Huang Ying <ying.huang@xxxxxxxxx> Cc: Omar Sandoval <osandov@xxxxxx> Cc: Paul McKenney <paulmck@xxxxxxxxxxxxxxxxxx> Cc: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Cc: Shaohua Li <shli@xxxxxxxxxx> Cc: Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx> Cc: Tejun Heo <tj@xxxxxxxxxx> Cc: Will Deacon <will.deacon@xxxxxxx> --- I'd appreciate it if someone more familiar with memory barriers could check this over. Thanks. Probably no need for stable, this is all theoretical. Against linux-mmotm tag v5.0-rc1-mmotm-2019-01-09-13-40 mm/swapfile.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f0edf7244256..dad52fc67045 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -99,6 +99,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); +static struct swap_info_struct *swap_type_to_swap_info(int type) +{ + if (type >= READ_ONCE(nr_swapfiles)) + return NULL; + + smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ + return READ_ONCE(swap_info[type]); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -1045,12 +1054,14 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) /* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si; + struct swap_info_struct *si = swap_type_to_swap_info(type); pgoff_t offset; - si = swap_info[type]; + if (!si) + goto fail; + spin_lock(&si->lock); - if (si && (si->flags & SWP_WRITEOK)) { + if (si->flags & SWP_WRITEOK) { atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); @@ -1061,6 +1072,7 @@ swp_entry_t get_swap_page_of_type(int type) atomic_long_inc(&nr_swap_pages); } spin_unlock(&si->lock); +fail: return (swp_entry_t) {0}; } @@ -1072,9 +1084,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + p = swap_type_to_swap_info(type); + if (!p) goto bad_nofile; - p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -1212,9 +1224,9 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + si = swap_type_to_swap_info(type); + if (!si) goto bad_nofile; - si = swap_info[type]; preempt_disable(); if (!(si->flags & SWP_VALID)) @@ -1765,10 +1777,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) sector_t swapdev_block(int type, pgoff_t offset) { struct block_device *bdev; + struct swap_info_struct *si = swap_type_to_swap_info(type); - if ((unsigned int)type >= nr_swapfiles) - return 0; - if (!(swap_info[type]->flags & SWP_WRITEOK)) + if (!si || !(si->flags & SWP_WRITEOK)) return 0; return map_swap_entry(swp_entry(type, offset), &bdev); } @@ -2799,9 +2810,9 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; type < nr_swapfiles; type++) { + for (type = 0; type < READ_ONCE(nr_swapfiles); type++) { smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + si = READ_ONCE(swap_info[type]); if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2821,9 +2832,9 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) else type = si->type + 1; - for (; type < nr_swapfiles; type++) { + for (; type < READ_ONCE(nr_swapfiles); type++) { smp_rmb(); /* read nr_swapfiles before swap_info[type] */ - si = swap_info[type]; + si = READ_ONCE(swap_info[type]); if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; @@ -2930,14 +2941,14 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= nr_swapfiles) { p->type = type; - swap_info[type] = p; + WRITE_ONCE(swap_info[type], p); /* * Write swap_info[type] before nr_swapfiles, in case a * racing procfs swap_start() or swap_next() is reading them. * (We never shrink nr_swapfiles, we never free this entry.) */ smp_wmb(); - nr_swapfiles++; + WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); } else { kvfree(p); p = swap_info[type]; -- 2.20.0