I'm now testing this patch, which removes PCG_ACCT_LRU, on top of mmotm. What do you think?
Here are perf numbers from running a page fault test.
==
[Before]
    11.20%  malloc  [kernel.kallsyms]  [k] clear_page_c
    ....
     1.80%  malloc  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     0.94%  malloc  [kernel.kallsyms]  [k] mem_cgroup_lru_add_list
     0.87%  malloc  [kernel.kallsyms]  [k] mem_cgroup_lru_del_list

[After]
    11.66%  malloc  [kernel.kallsyms]  [k] clear_page_c
     2.17%  malloc  [kernel.kallsyms]  [k] __mem_cgroup_commit_charge
     0.56%  malloc  [kernel.kallsyms]  [k] mem_cgroup_lru_add_list
     0.25%  malloc  [kernel.kallsyms]  [k] mem_cgroup_lru_del_list
==
The result seems attractive to me.
(A small, self-contained sketch of the new commit sequence is appended after the patch for illustration.)
==
From 882fefc1681591af5a1a3ac697012cef7952a462 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Date: Fri, 2 Dec 2011 19:11:33 +0900
Subject: [PATCH] memcg: remove PCG_ACCT_LRU.

Currently, memcg uses the PCG_ACCT_LRU bit to check whether a page is on an
LRU list. Now that page->lru is used for the LRU, it no longer seems
necessary to check the PCG_ACCT_LRU flag, which only adds atomic operations
to LRU handling.

- This patch removes the flag and adds a debug field (under CONFIG_DEBUG_VM)
  instead.
- This patch modifies the case where the page is already on an LRU before
  commit_charge. The new logic is:

	zone->lru_lock
	if (PageLRU(page))
		remove from LRU
	commit_charge()
	if (removed_from_lru)
		add page to LRU again
	zone->lru_unlock

Then, the swapin-and-map path is handled safely, without races.
But yes, this needs heavy testing.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
 include/linux/page_cgroup.h |   10 +---
 mm/memcontrol.c             |  102 ++++++++++--------------------------------
 2 files changed, 28 insertions(+), 84 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index aaa60da..92ab60a 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -11,7 +11,6 @@ enum {
 	PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
 	PCG_FILE_MAPPED, /* page is accounted as "mapped" */
 	/* No lock in page_cgroup */
-	PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
 	__NR_PCG_FLAGS,
 };
 
@@ -31,6 +30,9 @@ enum {
 struct page_cgroup {
 	unsigned long flags;
 	struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_DEBUG_VM
+	struct mem_cgroup *mem_cgroup_lru;
+#endif
 };
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -75,12 +77,6 @@ TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
 SETPCGFLAG(Used, USED)
 
-SETPCGFLAG(AcctLRU, ACCT_LRU)
-CLEARPCGFLAG(AcctLRU, ACCT_LRU)
-TESTPCGFLAG(AcctLRU, ACCT_LRU)
-TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
-
-
 SETPCGFLAG(FileMapped, FILE_MAPPED)
 CLEARPCGFLAG(FileMapped, FILE_MAPPED)
 TESTPCGFLAG(FileMapped, FILE_MAPPED)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8880a32..9b9b81c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -974,7 +974,6 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 		return &zone->lruvec;
 
 	pc = lookup_page_cgroup(page);
-	VM_BUG_ON(PageCgroupAcctLRU(pc));
 	/*
 	 * putback:				charge:
 	 *	SetPageLRU			SetPageCgroupUsed
@@ -995,9 +994,12 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 		/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
 		smp_rmb();
 		memcg = pc->mem_cgroup;
-		SetPageCgroupAcctLRU(pc);
 	} else
 		memcg = root_mem_cgroup;
+#ifdef CONFIG_DEBUG_VM
+	pc->mem_cgroup_lru = memcg;
+#endif
+
 	mz = page_cgroup_zoneinfo(memcg, page);
 	/* compound_order() is stabilized through lru_lock */
 	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
@@ -1024,18 +1026,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
 		return;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * root_mem_cgroup babysits uncharged LRU pages, but
-	 * PageCgroupUsed is cleared when the page is about to get
-	 * freed. PageCgroupAcctLRU remembers whether the
-	 * LRU-accounting happened against pc->mem_cgroup or
-	 * root_mem_cgroup.
-	 */
-	if (TestClearPageCgroupAcctLRU(pc)) {
-		VM_BUG_ON(!pc->mem_cgroup);
-		memcg = pc->mem_cgroup;
-	} else
-		memcg = root_mem_cgroup;
+	memcg = pc->mem_cgroup ? pc->mem_cgroup : root_mem_cgroup;
+	VM_BUG_ON(memcg != pc->mem_cgroup_lru);
 	mz = page_cgroup_zoneinfo(memcg, page);
 	/* huge page split is done under lru_lock. so, we have no races. */
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
@@ -1074,29 +1066,15 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
  * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
  * while it's linked to lru because the page may be reused after it's fully
  * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
- * It's done under lock_page and expected that zone->lru_lock isnever held.
+ * It's done under lock_page. zone->lru_lock is held in the caller.
  */
-static void mem_cgroup_lru_del_before_commit(struct page *page)
+static bool mem_cgroup_lru_del_before_commit(struct page *page,
+					struct page_cgroup *pc)
 {
-	enum lru_list lru;
-	unsigned long flags;
 	struct zone *zone = page_zone(page);
-	struct page_cgroup *pc = lookup_page_cgroup(page);
+	bool removed = false;
 
 	/*
-	 * Doing this check without taking ->lru_lock seems wrong but this
-	 * is safe. Because if page_cgroup's USED bit is unset, the page
-	 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
-	 * set, the commit after this will fail, anyway.
-	 * This all charge/uncharge is done under some mutual execustion.
-	 * So, we don't need to taking care of changes in USED bit.
-	 */
-	if (likely(!PageLRU(page)))
-		return;
-
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	lru = page_lru(page);
-	/*
 	 * The uncharged page could still be registered to the LRU of
 	 * the stale pc->mem_cgroup.
 	 *
@@ -1107,47 +1085,11 @@ static void mem_cgroup_lru_del_before_commit(struct page *page)
 	 * The PCG_USED bit is guarded by lock_page() as the page is
 	 * swapcache/pagecache.
 	 */
-	if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) {
-		del_page_from_lru_list(zone, page, lru);
-		add_page_to_lru_list(zone, page, lru);
-	}
-	spin_unlock_irqrestore(&zone->lru_lock, flags);
-}
-
-static void mem_cgroup_lru_add_after_commit(struct page *page)
-{
-	enum lru_list lru;
-	unsigned long flags;
-	struct zone *zone = page_zone(page);
-	struct page_cgroup *pc = lookup_page_cgroup(page);
-	/*
-	 * putback:				charge:
-	 *	SetPageLRU			SetPageCgroupUsed
-	 *	smp_mb				smp_mb
-	 *	PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
-	 *
-	 * Ensure that one of the two sides adds the page to the memcg
-	 * LRU during a race.
-	 */
-	smp_mb();
-	/* taking care of that the page is added to LRU while we commit it */
-	if (likely(!PageLRU(page)))
-		return;
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	lru = page_lru(page);
-	/*
-	 * If the page is not on the LRU, someone will soon put it
-	 * there. If it is, and also already accounted for on the
-	 * memcg-side, it must be on the right lruvec as setting
-	 * pc->mem_cgroup and PageCgroupUsed is properly ordered.
-	 * Otherwise, root_mem_cgroup has been babysitting the page
-	 * during the charge. Move it to the new memcg now.
-	 */
-	if (PageLRU(page) && !PageCgroupAcctLRU(pc)) {
-		del_page_from_lru_list(zone, page, lru);
-		add_page_to_lru_list(zone, page, lru);
+	if (PageLRU(page) && !PageCgroupUsed(pc)) {
+		del_page_from_lru_list(zone, page, page_lru(page));
+		removed = true;
 	}
-	spin_unlock_irqrestore(&zone->lru_lock, flags);
+	return removed;
 }
 
 /*
@@ -2468,7 +2410,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
-			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+			(1 << PCG_MIGRATION))
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2493,8 +2435,8 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 		 */
 		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
 	}
-
-	if (PageCgroupAcctLRU(head_pc)) {
+	/* we're under zone->lru_lock. page's LRU state never changes. */
+	if (PageLRU(head)) {
 		enum lru_list lru;
 		struct mem_cgroup_per_zone *mz;
 		/*
@@ -2695,14 +2637,20 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
 					enum charge_type ctype)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
+	struct zone *zone = page_zone(page);
+	unsigned long flags;
+	bool removed;
 	/*
 	 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
 	 * is already on LRU. It means the page may on some other page_cgroup's
 	 * LRU. Take care of it.
 	 */
-	mem_cgroup_lru_del_before_commit(page);
+	spin_lock_irqsave(&zone->lru_lock, flags);
+	removed = mem_cgroup_lru_del_before_commit(page, pc);
 	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
-	mem_cgroup_lru_add_after_commit(page);
+	if (removed)
+		add_page_to_lru_list(page_zone(page), page, page_lru(page));
+	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	return;
 }
-- 
1.7.4.1
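
Appendix (illustration only, not part of the patch): here is a minimal userspace model of the new lrucare commit sequence described in the changelog. All names in it (fake_page, fake_memcg, lru_del_before_commit, commit_charge_lrucare) are invented for this sketch, and a pthread mutex stands in for zone->lru_lock; the only point is the ordering, i.e. one lock hold covers "isolate from LRU if present" -> "commit the charge" -> "putback on the new owner's LRU".

/*
 * Userspace model of the lrucare commit sequence. Illustrative sketch only;
 * none of these symbols are kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_memcg { const char *name; };

struct fake_page {
	bool on_lru;			/* models PageLRU()      */
	struct fake_memcg *memcg;	/* models pc->mem_cgroup */
};

/* models zone->lru_lock */
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

/* models mem_cgroup_lru_del_before_commit(); caller holds lru_lock */
static bool lru_del_before_commit(struct fake_page *page)
{
	if (page->on_lru) {
		page->on_lru = false;	/* del_page_from_lru_list() */
		return true;
	}
	return false;
}

/* models __mem_cgroup_commit_charge(): rewrite the page's owner */
static void commit_charge(struct fake_page *page, struct fake_memcg *memcg)
{
	page->memcg = memcg;
}

/* models __mem_cgroup_commit_charge_lrucare() after the patch */
static void commit_charge_lrucare(struct fake_page *page, struct fake_memcg *memcg)
{
	bool removed;

	pthread_mutex_lock(&lru_lock);
	removed = lru_del_before_commit(page);
	commit_charge(page, memcg);
	if (removed)
		page->on_lru = true;	/* add_page_to_lru_list() on the new owner */
	pthread_mutex_unlock(&lru_lock);
}

int main(void)
{
	struct fake_memcg root = { "root" }, target = { "target" };
	struct fake_page page = { .on_lru = true, .memcg = &root };

	commit_charge_lrucare(&page, &target);
	printf("on_lru=%d memcg=%s\n", page.on_lru, page.memcg->name);
	return 0;
}

Built with e.g. "cc -pthread" and run, it ends with the page back on the LRU and owned by the target memcg, which is the invariant the single lock hold is meant to keep for the swapin-and-map case.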