On Wed, Nov 20, 2019 at 8:58 AM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> While reviewing the "per lruvec lru_lock for memcg" series, Hugh and I
> noticed two places in the existing code where the page -> memcg ->
> lruvec lookup can result in a use-after-free bug. This affects cgroup1
> setups that have charge migration enabled.
>
> To pin page->mem_cgroup, callers need to either have the page locked,
> an exclusive refcount (0), or hold the lru_lock and "own" PageLRU
> (either ensure it's set, or be the one to hold the page in isolation)
> to make cgroup migration fail the isolation step.

I think we should add the above para as a comment in the code for
better visibility - a rough sketch of what I have in mind is below,
after the swap.c hunks.

> Failure to follow
> this can result in the page moving out of the memcg and freeing it,
> along with its lruvecs, while the observer is dereferencing them.
>
> 1. isolate_lru_page() calls mem_cgroup_page_lruvec() with the lru_lock
> held but before testing PageLRU. It doesn't dereference the returned
> lruvec before testing PageLRU, which gives the impression that the
> ordering might be safe after all - but mem_cgroup_page_lruvec() itself
> touches the lruvec to lazily initialize its pgdat back pointer. This
> one is easy to fix: just move the lookup into the PageLRU branch.
>
> 2. pagevec_lru_move_fn() conveniently looks up the lruvec for all the
> callbacks it might get invoked on. Unfortunately, it's the callbacks
> that first check PageLRU under the lru_lock, which makes this ordering
> just as unsafe as in isolate_lru_page(). Remove the lruvec argument
> from the move callbacks and let them do the lookup inside their
> PageLRU branches.
>
> Reported-by: Hugh Dickins <hughd@xxxxxxxxxx>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Reviewed-by: Shakeel Butt <shakeelb@xxxxxxxxxx>

> ---
>  mm/swap.c   | 48 +++++++++++++++++++++++++++++-------------------
>  mm/vmscan.c |  8 ++++----
>  2 files changed, 33 insertions(+), 23 deletions(-)
>
> diff --git a/mm/swap.c b/mm/swap.c
> index 5341ae93861f..6b015e9532fb 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -188,12 +188,11 @@ int get_kernel_page(unsigned long start, int write, struct page **pages)
>  EXPORT_SYMBOL_GPL(get_kernel_page);
>
>  static void pagevec_lru_move_fn(struct pagevec *pvec,
> -        void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
> +        void (*move_fn)(struct page *page, void *arg),
>          void *arg)
>  {
>          int i;
>          struct pglist_data *pgdat = NULL;
> -        struct lruvec *lruvec;
>          unsigned long flags = 0;
>
>          for (i = 0; i < pagevec_count(pvec); i++) {
> @@ -207,8 +206,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
>                          spin_lock_irqsave(&pgdat->lru_lock, flags);
>                  }
>
> -                lruvec = mem_cgroup_page_lruvec(page, pgdat);
> -                (*move_fn)(page, lruvec, arg);
> +                (*move_fn)(page, arg);
>          }
>          if (pgdat)
>                  spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> @@ -216,12 +214,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
>          pagevec_reinit(pvec);
>  }
>
> -static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
> -                                 void *arg)
> +static void pagevec_move_tail_fn(struct page *page, void *arg)
>  {
>          int *pgmoved = arg;
>
>          if (PageLRU(page) && !PageUnevictable(page)) {
> +                struct lruvec *lruvec;
> +
> +                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>                  del_page_from_lru_list(page, lruvec, page_lru(page));
>                  ClearPageActive(page);
>                  add_page_to_lru_list_tail(page, lruvec, page_lru(page));
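
All the move callbacks below end up with the same shape, so for anyone
following along, here is my own distillation of the ordering this patch
establishes (just a sketch with a made-up callback name, not patch
text):

static void example_move_fn(struct page *page, void *arg)
{
        /*
         * Test PageLRU first, under the lru_lock the caller already
         * holds: lru_lock plus an observed PageLRU pin
         * page->mem_cgroup against charge migration. Only then is
         * the lookup safe - mem_cgroup_page_lruvec() already
         * dereferences the lruvec to lazily initialize its pgdat
         * back pointer.
         */
        if (PageLRU(page)) {
                struct lruvec *lruvec;

                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
                /* ... move the page between lruvec's lists ... */
        }
}
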
> @@ -272,12 +272,14 @@ static void update_page_reclaim_stat(struct lruvec *lruvec,
>                  reclaim_stat->recent_rotated[file]++;
>  }
>
> -static void __activate_page(struct page *page, struct lruvec *lruvec,
> -                            void *arg)
> +static void __activate_page(struct page *page, void *arg)
>  {
>          if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
>                  int file = page_is_file_cache(page);
>                  int lru = page_lru_base_type(page);
> +                struct lruvec *lruvec;
> +
> +                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>
>                  del_page_from_lru_list(page, lruvec, lru);
>                  SetPageActive(page);
> @@ -328,7 +330,7 @@ void activate_page(struct page *page)
>
>          page = compound_head(page);
>          spin_lock_irq(&pgdat->lru_lock);
> -        __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
> +        __activate_page(page, NULL);
>          spin_unlock_irq(&pgdat->lru_lock);
>  }
>  #endif
> @@ -498,9 +500,9 @@ void lru_cache_add_active_or_unevictable(struct page *page,
>   * be write it out by flusher threads as this is much more effective
>   * than the single-page writeout from reclaim.
>   */
> -static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
> -                                   void *arg)
> +static void lru_deactivate_file_fn(struct page *page, void *arg)
>  {
> +        struct lruvec *lruvec;
>          int lru, file;
>          bool active;
>
> @@ -518,6 +520,8 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
>          file = page_is_file_cache(page);
>          lru = page_lru_base_type(page);
>
> +        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +
>          del_page_from_lru_list(page, lruvec, lru + active);
>          ClearPageActive(page);
>          ClearPageReferenced(page);
> @@ -544,12 +548,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
>          update_page_reclaim_stat(lruvec, file, 0);
>  }
>
> -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
> -                              void *arg)
> +static void lru_deactivate_fn(struct page *page, void *arg)
>  {
>          if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
>                  int file = page_is_file_cache(page);
>                  int lru = page_lru_base_type(page);
> +                struct lruvec *lruvec;
> +
> +                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>
>                  del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
>                  ClearPageActive(page);
> @@ -561,12 +567,14 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
>          }
>  }
>
> -static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
> -                            void *arg)
> +static void lru_lazyfree_fn(struct page *page, void *arg)
>  {
>          if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
>              !PageSwapCache(page) && !PageUnevictable(page)) {
>                  bool active = PageActive(page);
> +                struct lruvec *lruvec;
> +
> +                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>
>                  del_page_from_lru_list(page, lruvec,
>                                         LRU_INACTIVE_ANON + active);
> @@ -921,15 +929,17 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> -static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
> -                                 void *arg)
> +static void __pagevec_lru_add_fn(struct page *page, void *arg)
>  {
> -        enum lru_list lru;
>          int was_unevictable = TestClearPageUnevictable(page);
> +        struct lruvec *lruvec;
> +        enum lru_list lru;
>
>          VM_BUG_ON_PAGE(PageLRU(page), page);
> -
>          SetPageLRU(page);
> +
> +        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +
>          /*
>           * Page becomes evictable in two ways:
>           * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
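
Following up on my comment at the top, here is roughly the comment I
have in mind, e.g. above mem_cgroup_page_lruvec() - just a sketch that
lifts your changelog wording, final phrasing and placement up to you:

/*
 * mem_cgroup_page_lruvec - look up the lruvec a page belongs to
 *
 * To pin page->mem_cgroup, the caller must either have the page
 * locked, hold an exclusive refcount (0), or hold the lru_lock and
 * "own" PageLRU - either ensure it is set, or be the one holding
 * the page in isolation - so that cgroup charge migration fails
 * the isolation step and cannot move the page away underneath us.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page,
                                      struct pglist_data *pgdat);

(__pagevec_lru_add_fn() above being the "ensure it's set" case of that
rule: it sets PageLRU itself under the lru_lock before doing the
lookup.)
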
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index df859b1d583c..3c8b81990146 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1767,15 +1767,15 @@ int isolate_lru_page(struct page *page)
>
>          if (PageLRU(page)) {
>                  pg_data_t *pgdat = page_pgdat(page);
> -                struct lruvec *lruvec;
>
>                  spin_lock_irq(&pgdat->lru_lock);
> -                lruvec = mem_cgroup_page_lruvec(page, pgdat);
>                  if (PageLRU(page)) {
> -                        int lru = page_lru(page);
> +                        struct lruvec *lruvec;
> +
> +                        lruvec = mem_cgroup_page_lruvec(page, pgdat);
>                          get_page(page);
>                          ClearPageLRU(page);
> -                        del_page_from_lru_list(page, lruvec, lru);
> +                        del_page_from_lru_list(page, lruvec, page_lru(page));
>                          ret = 0;
>                  }
>                  spin_unlock_irq(&pgdat->lru_lock);
> --
> 2.24.0
>
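
To condense the isolate_lru_page() change into a before/after picture
for the archives, as I understand the race (my own sketch, not patch
text):

/* Old, unsafe order: nothing pins page->mem_cgroup yet */
spin_lock_irq(&pgdat->lru_lock);
lruvec = mem_cgroup_page_lruvec(page, pgdat);   /* writes lruvec->pgdat */
if (PageLRU(page)) {
        ...

If the page has already been isolated and moved to another memcg by
the time we take the lru_lock, nothing keeps the old memcg alive, so
that lazy pgdat write can land in freed memory - which is why this
only affects cgroup1 setups with charge migration enabled.

/* New, safe order: lru_lock + an observed PageLRU pin page->mem_cgroup */
spin_lock_irq(&pgdat->lru_lock);
if (PageLRU(page)) {
        struct lruvec *lruvec = mem_cgroup_page_lruvec(page, pgdat);
        ...
}
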