From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

CC'ed to Mel, Christoph, and KOSAKI because I want to be double-checked
in case I'm missing something important about isolate_lru_page().

Checking the performance of memcg's move_account with perf, you may
notice that put_page() costs too much:

#
# Overhead  Command      Shared Object  Symbol
# ........  .......  .................  .....................................
#
    14.24%     echo  [kernel.kallsyms]  [k] put_page
    12.80%     echo  [kernel.kallsyms]  [k] isolate_lru_page
     9.67%     echo  [kernel.kallsyms]  [k] is_target_pte_for_mc
     8.11%     echo  [kernel.kallsyms]  [k] ____pagevec_lru_add
     7.22%     echo  [kernel.kallsyms]  [k] putback_lru_page

This is because mc_handle_present_pte() does get_page_unless_zero(), so
page->count is updated 4 times per page:

    get_page_unless_zero()    #1
    isolate_lru_page()
    putback_lru_page()
    put_page()

But all of the above run under pte_offset_map_lock().
get_page_unless_zero() #1 is not necessary, because everything happens
under the pte lock.

isolate_lru_page()'s comment says:

# Restrictions:
# (1) Must be called with an elevated refcount on the page. This is a
#     fundamentnal difference from isolate_lru_pages (which is called
#     without a stable reference).

So the current implementation does get_page_unless_zero() explicitly,
but holding the pte lock already implies a stable reference. I removed
#1. (For reference, a short sketch of the resulting per-page refcount
lifecycle follows the patch.)

With the patch, performance is:

[Before Patch]
[root@bluextal kamezawa]# time echo 2530 > /cgroup/B/tasks

real    0m0.792s
user    0m0.000s
sys     0m0.780s

[After Patch]
[root@bluextal kamezawa]# time echo 2257 > /cgroup/B/tasks

real    0m0.694s
user    0m0.000s
sys     0m0.683s

perf's log is:

    10.82%     echo  [kernel.kallsyms]  [k] isolate_lru_page
    10.01%     echo  [kernel.kallsyms]  [k] mem_cgroup_move_account
     8.75%     echo  [kernel.kallsyms]  [k] is_target_pte_for_mc
     8.52%     echo  [kernel.kallsyms]  [k] ____pagevec_lru_add
     6.90%     echo  [kernel.kallsyms]  [k] putback_lru_page
     6.36%     echo  [kernel.kallsyms]  [k] mem_cgroup_add_lru_list
     6.22%     echo  [kernel.kallsyms]  [k] mem_cgroup_del_lru_list
     5.68%     echo  [kernel.kallsyms]  [k] lookup_page_cgroup
     5.28%     echo  [kernel.kallsyms]  [k] __lru_cache_add
     5.00%     echo  [kernel.kallsyms]  [k] release_pages
     3.79%     echo  [kernel.kallsyms]  [k] _raw_spin_lock_irq
     3.52%     echo  [kernel.kallsyms]  [k] memcg_check_events
     3.38%     echo  [kernel.kallsyms]  [k] bit_spin_lock
     3.25%     echo  [kernel.kallsyms]  [k] put_page

This looks nice. I updated isolate_lru_page()'s comment, too:

# Note: isolate_lru_page() is necessary before the account move, to
# avoid memcg's LRU manipulation.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
 mm/memcontrol.c |   63 +++++++++++++++++++++++++++++++++++---------------------
 mm/vmscan.c     |    3 +-
 2 files changed, 42 insertions(+), 24 deletions(-)

Index: mmotm-1013/mm/memcontrol.c
===================================================================
--- mmotm-1013.orig/mm/memcontrol.c
+++ mmotm-1013/mm/memcontrol.c
@@ -1169,7 +1169,6 @@ static void mem_cgroup_end_move(struct m
  * under hierarchy of moving cgroups. This is for
  * waiting at hith-memory prressure caused by "move".
  */
-
 static bool mem_cgroup_stealed(struct mem_cgroup *mem)
 {
 	VM_BUG_ON(!rcu_read_lock_held());
@@ -4471,11 +4470,14 @@ one_by_one:
  * Returns
  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- *     move charge. if @target is not NULL, the page is stored in target->page
- *     with extra refcnt got(Callers should handle it).
+ *     move charge and it's mapped. if @target is not NULL, the page is
+ *     stored in target->page without extra refcnt.
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. if @target is not NULL, the entry is stored
 *     in target->ent.
+ *   3(MC_TARGET_UNMAPPED_PAGE): if the page corresponding to this pte is a
+ *     target for move charge. if @target is not NULL, the page is stored in
+ *     target->page with extra refcnt got(Callers should handle it).
 *
 * Called with pte lock held.
 */
@@ -4486,8 +4488,9 @@ union mc_target {

 enum mc_target_type {
 	MC_TARGET_NONE,	/* not used */
-	MC_TARGET_PAGE,
+	MC_TARGET_PAGE,		/* a page mapped */
 	MC_TARGET_SWAP,
+	MC_TARGET_UNMAPPED_PAGE, /* a page unmapped */
 };

 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
@@ -4504,9 +4507,10 @@ static struct page *mc_handle_present_pt
 	} else if (!move_file())
 		/* we ignore mapcount for file pages */
 		return NULL;
-	if (!get_page_unless_zero(page))
-		return NULL;
-
+	/*
+	 * Because we're under pte_lock and the page is mapped,
+	 * get_page() isn't necessary.
+	 */
 	return page;
 }

@@ -4570,14 +4574,18 @@ static int is_target_pte_for_mc(struct v
 	struct page *page = NULL;
 	struct page_cgroup *pc;
 	int ret = 0;
+	bool present = true;
 	swp_entry_t ent = { .val = 0 };

 	if (pte_present(ptent))
 		page = mc_handle_present_pte(vma, addr, ptent);
-	else if (is_swap_pte(ptent))
-		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
-	else if (pte_none(ptent) || pte_file(ptent))
-		page = mc_handle_file_pte(vma, addr, ptent, &ent);
+	else {
+		present = false;
+		if (is_swap_pte(ptent))
+			page = mc_handle_swap_pte(vma, addr, ptent, &ent);
+		else if (pte_none(ptent) || pte_file(ptent))
+			page = mc_handle_file_pte(vma, addr, ptent, &ent);
+	}

 	if (!page && !ent.val)
 		return 0;
@@ -4589,11 +4597,15 @@ static int is_target_pte_for_mc(struct v
 		 * the lock.
 		 */
 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
-			ret = MC_TARGET_PAGE;
+			if (present)
+				ret = MC_TARGET_PAGE;
+			else
+				ret = MC_TARGET_UNMAPPED_PAGE;
 			if (target)
 				target->page = page;
 		}
-		if (!ret || !target)
+		/* We got a refcnt but the page is not a target; drop it */
+		if (!present && (!ret || !target))
 			put_page(page);
 	}
 	/* There is a swap entry and a page doesn't exist or isn't charged */
@@ -4780,19 +4792,24 @@ retry:
 		type = is_target_pte_for_mc(vma, addr, ptent, &target);
 		switch (type) {
 		case MC_TARGET_PAGE:
+		case MC_TARGET_UNMAPPED_PAGE:
 			page = target.page;
-			if (isolate_lru_page(page))
-				goto put;
-			pc = lookup_page_cgroup(page);
-			if (!mem_cgroup_move_account(pc,
+			if (!isolate_lru_page(page)) {
+				pc = lookup_page_cgroup(page);
+				if (!mem_cgroup_move_account(pc,
 						mc.from, mc.to, false)) {
-				mc.precharge--;
-				/* we uncharge from mc.from later. */
-				mc.moved_charge++;
+					mc.precharge--;
+					/* we uncharge from mc.from later. */
+					mc.moved_charge++;
+				}
+				putback_lru_page(page);
 			}
-			putback_lru_page(page);
-put:			/* is_target_pte_for_mc() gets the page */
-			put_page(page);
+			/*
+			 * Because we hold the pte lock, we have a stable
+			 * reference to the page if it is mapped. If not
+			 * mapped, we have an elevated refcnt; drop it.
+			 */
+			if (type == MC_TARGET_UNMAPPED_PAGE)
+				put_page(page);
 			break;
 		case MC_TARGET_SWAP:
 			ent = target.ent;
Index: mmotm-1013/mm/vmscan.c
===================================================================
--- mmotm-1013.orig/mm/vmscan.c
+++ mmotm-1013/mm/vmscan.c
@@ -1166,7 +1166,8 @@ static unsigned long clear_active_flags(
 * found will be decremented.
 *
 * Restrictions:
- * (1) Must be called with an elevated refcount on the page. This is a
- *     fundamentnal difference from isolate_lru_pages (which is called
- *     without a stable reference).
+ * (1) Must be called with an elevated refcount on the page, IOW, the
+ *     caller must guarantee that there is a stable reference. This is a
+ *     fundamentnal difference from isolate_lru_pages (which is called
+ *     without a stable reference).
 * (2) the lru_lock must not be held.
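
For reference, here is a minimal sketch (not part of the patch) of the
per-page refcount lifecycle the change relies on. It is illustrative
only: move_one_page_sketch() is a hypothetical helper invented for this
mail, while vm_normal_page(), isolate_lru_page() and putback_lru_page()
are the real kernel functions. The caller is assumed to hold the lock
taken by pte_offset_map_lock(), exactly as in
mem_cgroup_move_charge_pte_range().

/*
 * Illustration only -- not kernel code. Assumes the caller holds the
 * pte lock from pte_offset_map_lock() and that ptent is present.
 */
static void move_one_page_sketch(struct vm_area_struct *vma,
				 unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page)
		return;
	/*
	 * While the pte lock is held, the pte cannot be unmapped, so the
	 * mapping's reference keeps page->count >= 1. That is the "stable
	 * reference" isolate_lru_page() requires -- no get_page() needed.
	 */
	if (!isolate_lru_page(page)) {		/* takes its own refcount */
		/* ... mem_cgroup_move_account() happens here ... */
		putback_lru_page(page);		/* drops that refcount */
	}
	/*
	 * No trailing put_page() in the mapped case: we never took an
	 * extra reference. Only MC_TARGET_UNMAPPED_PAGE (the swap/file
	 * lookup paths) hands the caller a page with an elevated refcnt
	 * that must be dropped.
	 */
}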