On Wed 22-10-14 14:29:28, Johannes Weiner wrote:
> 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
> migration to uncharge the old page right away. The page is locked,
> unmapped, truncated, and off the LRU, but it could race with writeback
> ending, which then doesn't unaccount the page properly:
>
> test_clear_page_writeback()              migration
>                                            acquire pc->mem_cgroup->move_lock

I do not think that mentioning the move_lock is important or helpful
here, because the hot path, which is taken all the time (except when a
task move is in progress), doesn't take it. Besides, it is not even
relevant to the race.

>   wait_on_page_writeback()
>   TestClearPageWriteback()
>                                            mem_cgroup_migrate()
>                                              clear PCG_USED
>   if (PageCgroupUsed(pc))
>     decrease memcg pages under writeback
>                                            release pc->mem_cgroup->move_lock
>
> The per-page statistics interface is heavily optimized to avoid a
> function call and a lookup_page_cgroup() in the file unmap fast path,
> which means it doesn't verify whether a page is still charged before
> clearing PageWriteback() and it has to do it in the stat update later.
>
> Rework it so that it looks up the page's memcg once at the beginning
> of the transaction and then uses it throughout. The charge will be
> verified before clearing PageWriteback() and migration can't uncharge
> the page as long as that is still set. The RCU lock will protect the
> memcg past uncharge.
>
> As far as losing the optimization goes, the following test results are
> from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
> three times in a nested fashion, so that there are two negative passes
> that don't account but still go through the new transaction overhead.
> There is no actual difference:
>
>   old: 33.195102545 seconds time elapsed ( +- 0.01% )
>   new: 33.199231369 seconds time elapsed ( +- 0.03% )
>
> The time spent in page_remove_rmap()'s callees still adds up to the
> same, but the time spent in the function itself seems reduced:
>
>        # Children  Self   Command        Shared Object      Symbol
>   old:      0.12%  0.11%  filemapstress  [kernel.kallsyms]  [k] page_remove_rmap
>   new:      0.12%  0.08%  filemapstress  [kernel.kallsyms]  [k] page_remove_rmap
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: "3.17" <stable@xxxxxxxxxx>

Acked-by: Michal Hocko <mhocko@xxxxxxx>

Thanks!
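For anyone skimming the patch, the reworked transaction in
test_clear_page_writeback() condenses to the following sketch (lifted
from the page-writeback.c hunk below, with the mapping/bdi bookkeeping
omitted for brevity):

	struct mem_cgroup *memcg;
	unsigned long memcg_flags;
	bool locked;
	int ret;

	/* Look up and pin the page's memcg once, under RCU. */
	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);

	ret = TestClearPageWriteback(page);
	if (ret) {
		/* NULL memcg (uncharged page) is handled by the helper. */
		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
		dec_zone_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_WRITTEN);
	}

	/* Drops move_lock if it was taken, then rcu_read_unlock(). */
	mem_cgroup_end_page_stat(memcg, locked, memcg_flags);

Because the memcg is pinned under rcu_read_lock() before PageWriteback
is cleared, a racing mem_cgroup_migrate() can no longer uncharge the
page out from under the statistics update.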
> ---
>  include/linux/memcontrol.h | 58 ++++++++++++++--------------------------------
>  mm/memcontrol.c            | 54 ++++++++++++++++++------------------
>  mm/page-writeback.c        | 22 ++++++++++--------
>  mm/rmap.c                  | 20 ++++++++--------
>  4 files changed, 61 insertions(+), 93 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0daf383f8f1c..ea007615e8f9 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void)
>  	return false;
>  }
>
> -void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
> -					 unsigned long *flags);
> -
> -extern atomic_t memcg_moving;
> -
> -static inline void mem_cgroup_begin_update_page_stat(struct page *page,
> -					bool *locked, unsigned long *flags)
> -{
> -	if (mem_cgroup_disabled())
> -		return;
> -	rcu_read_lock();
> -	*locked = false;
> -	if (atomic_read(&memcg_moving))
> -		__mem_cgroup_begin_update_page_stat(page, locked, flags);
> -}
> -
> -void __mem_cgroup_end_update_page_stat(struct page *page,
> -				unsigned long *flags);
> -static inline void mem_cgroup_end_update_page_stat(struct page *page,
> -					bool *locked, unsigned long *flags)
> -{
> -	if (mem_cgroup_disabled())
> -		return;
> -	if (*locked)
> -		__mem_cgroup_end_update_page_stat(page, flags);
> -	rcu_read_unlock();
> -}
> -
> -void mem_cgroup_update_page_stat(struct page *page,
> -				 enum mem_cgroup_stat_index idx,
> -				 int val);
> -
> -static inline void mem_cgroup_inc_page_stat(struct page *page,
> +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked,
> +					      unsigned long *flags);
> +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
> +			      unsigned long flags);
> +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
> +				 enum mem_cgroup_stat_index idx, int val);
> +
> +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
>  					    enum mem_cgroup_stat_index idx)
>  {
> -	mem_cgroup_update_page_stat(page, idx, 1);
> +	mem_cgroup_update_page_stat(memcg, idx, 1);
>  }
>
> -static inline void mem_cgroup_dec_page_stat(struct page *page,
> +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
>  					    enum mem_cgroup_stat_index idx)
>  {
> -	mem_cgroup_update_page_stat(page, idx, -1);
> +	mem_cgroup_update_page_stat(memcg, idx, -1);
>  }
>
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> @@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  {
>  }
>
> -static inline void mem_cgroup_begin_update_page_stat(struct page *page,
> +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
>  					bool *locked, unsigned long *flags)
>  {
> +	return NULL;
>  }
>
> -static inline void mem_cgroup_end_update_page_stat(struct page *page,
> -					bool *locked, unsigned long *flags)
> +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg,
> +					bool locked, unsigned long flags)
>  {
>  }
>
> @@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
>  	return false;
>  }
>
> -static inline void mem_cgroup_inc_page_stat(struct page *page,
> +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
>  					    enum mem_cgroup_stat_index idx)
>  {
>  }
>
> -static inline void mem_cgroup_dec_page_stat(struct page *page,
> +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
>  					    enum mem_cgroup_stat_index idx)
>  {
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3a203c7ec6c7..d84f316ac901 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1463,12 +1463,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
>   * start move here.
>   */
>
> -/* for quick checking without looking up memcg */
> -atomic_t memcg_moving __read_mostly;
> -
>  static void mem_cgroup_start_move(struct mem_cgroup *memcg)
>  {
> -	atomic_inc(&memcg_moving);
>  	atomic_inc(&memcg->moving_account);
>  	synchronize_rcu();
>  }
> @@ -1479,10 +1475,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
>  	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
>  	 * We check NULL in callee rather than caller.
>  	 */
> -	if (memcg) {
> -		atomic_dec(&memcg_moving);
> +	if (memcg)
>  		atomic_dec(&memcg->moving_account);
> -	}
>  }
>
>  /*
> @@ -2132,26 +2126,32 @@ cleanup:
>   * account and taking the move_lock in the slowpath.
>   */
>
> -void __mem_cgroup_begin_update_page_stat(struct page *page,
> -				bool *locked, unsigned long *flags)
> +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> +					      bool *locked,
> +					      unsigned long *flags)
>  {
>  	struct mem_cgroup *memcg;
>  	struct page_cgroup *pc;
>
> +	rcu_read_lock();
> +
> +	if (mem_cgroup_disabled())
> +		return NULL;
> +
>  	pc = lookup_page_cgroup(page);
>  again:
>  	memcg = pc->mem_cgroup;
>  	if (unlikely(!memcg || !PageCgroupUsed(pc)))
> -		return;
> +		return NULL;
>  	/*
>  	 * If this memory cgroup is not under account moving, we don't
>  	 * need to take move_lock_mem_cgroup(). Because we already hold
>  	 * rcu_read_lock(), any calls to move_account will be delayed until
>  	 * rcu_read_unlock().
>  	 */
> -	VM_BUG_ON(!rcu_read_lock_held());
> +	*locked = false;
>  	if (atomic_read(&memcg->moving_account) <= 0)
> -		return;
> +		return memcg;
>
>  	move_lock_mem_cgroup(memcg, flags);
>  	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
> @@ -2159,36 +2159,26 @@ again:
>  		goto again;
>  	}
>  	*locked = true;
> +
> +	return memcg;
>  }
>
> -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
> +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
> +			      unsigned long flags)
>  {
> -	struct page_cgroup *pc = lookup_page_cgroup(page);
> +	if (memcg && locked)
> +		move_unlock_mem_cgroup(memcg, &flags);
>
> -	/*
> -	 * It's guaranteed that pc->mem_cgroup never changes while
> -	 * lock is held because a routine modifies pc->mem_cgroup
> -	 * should take move_lock_mem_cgroup().
> -	 */
> -	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
> +	rcu_read_unlock();
>  }
>
> -void mem_cgroup_update_page_stat(struct page *page,
> +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
>  				 enum mem_cgroup_stat_index idx, int val)
>  {
> -	struct mem_cgroup *memcg;
> -	struct page_cgroup *pc = lookup_page_cgroup(page);
> -	unsigned long uninitialized_var(flags);
> -
> -	if (mem_cgroup_disabled())
> -		return;
> -
>  	VM_BUG_ON(!rcu_read_lock_held());
> -	memcg = pc->mem_cgroup;
> -	if (unlikely(!memcg || !PageCgroupUsed(pc)))
> -		return;
>
> -	this_cpu_add(memcg->stat->count[idx], val);
> +	if (memcg)
> +		this_cpu_add(memcg->stat->count[idx], val);
>  }
>
>  /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index ff6a5b07211e..19ceae87522d 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
>  int test_clear_page_writeback(struct page *page)
>  {
>  	struct address_space *mapping = page_mapping(page);
> -	int ret;
> -	bool locked;
>  	unsigned long memcg_flags;
> +	struct mem_cgroup *memcg;
> +	bool locked;
> +	int ret;
>
> -	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
> +	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
>  	if (mapping) {
>  		struct backing_dev_info *bdi = mapping->backing_dev_info;
>  		unsigned long flags;
> @@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
>  		ret = TestClearPageWriteback(page);
>  	}
>  	if (ret) {
> -		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
> +		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
>  		dec_zone_page_state(page, NR_WRITEBACK);
>  		inc_zone_page_state(page, NR_WRITTEN);
>  	}
> -	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
> +	mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
>  	return ret;
>  }
>
>  int __test_set_page_writeback(struct page *page, bool keep_write)
>  {
>  	struct address_space *mapping = page_mapping(page);
> -	int ret;
> -	bool locked;
>  	unsigned long memcg_flags;
> +	struct mem_cgroup *memcg;
> +	bool locked;
> +	int ret;
>
> -	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
> +	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
>  	if (mapping) {
>  		struct backing_dev_info *bdi = mapping->backing_dev_info;
>  		unsigned long flags;
> @@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
>  		ret = TestSetPageWriteback(page);
>  	}
>  	if (!ret) {
> -		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
> +		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
>  		inc_zone_page_state(page, NR_WRITEBACK);
>  	}
> -	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
> +	mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
>  	return ret;
>
>  }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 116a5053415b..f574046f77d4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page,
>   */
>  void page_add_file_rmap(struct page *page)
>  {
> -	bool locked;
> +	struct mem_cgroup *memcg;
>  	unsigned long flags;
> +	bool locked;
>
> -	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> +	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
>  	if (atomic_inc_and_test(&page->_mapcount)) {
>  		__inc_zone_page_state(page, NR_FILE_MAPPED);
> -		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
> +		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
>  	}
> -	mem_cgroup_end_update_page_stat(page, &locked, &flags);
> +	mem_cgroup_end_page_stat(memcg, locked, flags);
>  }
>
>  /**
> @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page)
>   */
>  void page_remove_rmap(struct page *page)
>  {
> +	struct mem_cgroup *uninitialized_var(memcg);
>  	bool anon = PageAnon(page);
> -	bool locked;
>  	unsigned long flags;
> +	bool locked;
>
>  	/*
>  	 * The anon case has no mem_cgroup page_stat to update; but may
>  	 * uncharge_page() below, where the lock ordering can deadlock if
>  	 * we hold the lock against page_stat move: so avoid it on anon.
>  	 */
>  	if (!anon)
> -		mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> +		memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
>
>  	/* page still mapped by someone else? */
>  	if (!atomic_add_negative(-1, &page->_mapcount))
> @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page)
>  				-hpage_nr_pages(page));
>  	} else {
>  		__dec_zone_page_state(page, NR_FILE_MAPPED);
> -		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
> -		mem_cgroup_end_update_page_stat(page, &locked, &flags);
> +		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
>  	}
>  	if (unlikely(PageMlocked(page)))
>  		clear_page_mlock(page);
> @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page)
>  	 * Leaving it set also helps swapoff to reinstate ptes
>  	 * faster for those pages still in swapcache.
>  	 */
> -	return;
>  out:
>  	if (!anon)
> -		mem_cgroup_end_update_page_stat(page, &locked, &flags);
> +		mem_cgroup_end_page_stat(memcg, locked, flags);
>  }
>
>  /*
> --
> 2.1.2
>

--
Michal Hocko
SUSE Labs