From: Andrea Arcangeli <aarcange@xxxxxxxxxx> Teach memcg to charge/uncharge compound pages. Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> Acked-by: Rik van Riel <riel@xxxxxxxxxx> --- diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -4,6 +4,10 @@ NOTE: The Memory Resource Controller has to as the memory controller in this document. Do not confuse memory controller used here with the memory controller that is used in hardware. +NOTE: When in this documentation we refer to PAGE_SIZE, we actually +mean the real page size of the page being accounted which is bigger than +PAGE_SIZE for compound pages. + Salient features a. Enable control of Anonymous, Page Cache (mapped and unmapped) and diff --git a/mm/memcontrol.c b/mm/memcontrol.c --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1509,12 +1509,14 @@ static int __cpuinit memcg_stock_cpu_cal * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct res_counter *fail_res; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1549,8 +1551,9 @@ static int __mem_cgroup_try_charge(struc int ret = 0; unsigned long flags = 0; - if (consume_stock(mem)) - goto done; + if (page_size == PAGE_SIZE) + if (consume_stock(mem)) + goto done; ret = res_counter_charge(&mem->res, csize, &fail_res); if (likely(!ret)) { @@ -1570,8 +1573,8 @@ static int __mem_cgroup_try_charge(struc res); /* reduce request size and retry */ - if (csize > PAGE_SIZE) { - csize = PAGE_SIZE; + if (csize > page_size) { + csize = page_size; continue; } if (!(gfp_mask & __GFP_WAIT)) @@ -1647,8 +1650,10 @@ static int __mem_cgroup_try_charge(struc goto bypass; } } - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); + if (page_size != PAGE_SIZE) + __css_get(&mem->css, page_size); done: return 0; nomem: @@ -1678,9 +1683,10 @@ static void __mem_cgroup_cancel_charge(s /* we don't need css_put for root */ } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -1736,8 +1742,9 @@ struct mem_cgroup *try_get_mem_cgroup_fr */ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) @@ -1746,7 +1753,7 @@ static void __mem_cgroup_commit_charge(s lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } @@ -1820,7 +1827,7 @@ static void __mem_cgroup_move_account(st mem_cgroup_charge_statistics(from, pc, false); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -1881,13 +1888,14 @@ static int mem_cgroup_move_parent(struct goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, + PAGE_SIZE); if (ret || !parent) goto put_back; ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, PAGE_SIZE); put_back: putback_lru_page(page); put: @@ -1909,6 +1917,10 @@ static int mem_cgroup_charge_common(stru struct mem_cgroup *mem; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -1917,11 +1929,11 @@ static int mem_cgroup_charge_common(stru prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -1930,8 +1942,6 @@ int mem_cgroup_newpage_charge(struct pag { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -1944,7 +1954,7 @@ int mem_cgroup_newpage_charge(struct pag if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } static void @@ -2037,14 +2047,14 @@ int mem_cgroup_try_charge_swapin(struct if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2060,7 +2070,7 @@ __mem_cgroup_commit_charge_swapin(struct cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2109,11 +2119,12 @@ void mem_cgroup_cancel_charge_swapin(str return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2146,14 +2157,14 @@ __do_uncharge(struct mem_cgroup *mem, co if (batch->memcg != mem) goto direct_uncharge; /* remember freed charge and uncharge it later */ - batch->bytes += PAGE_SIZE; + batch->bytes += page_size; if (uncharge_memsw) - batch->memsw_bytes += PAGE_SIZE; + batch->memsw_bytes += page_size; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); return; } @@ -2166,6 +2177,10 @@ __mem_cgroup_uncharge_common(struct page struct page_cgroup *pc; struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2205,7 +2220,7 @@ __mem_cgroup_uncharge_common(struct page } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); @@ -2418,6 +2433,7 @@ int mem_cgroup_prepare_migration(struct struct mem_cgroup *mem = NULL; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2430,7 +2446,8 @@ int mem_cgroup_prepare_migration(struct unlock_page_cgroup(pc); if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); css_put(&mem->css); } *ptr = mem; @@ -2473,7 +2490,7 @@ void mem_cgroup_end_migration(struct mem * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. * So, double-counting is effectively avoided. */ - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); /* * Both of oldpage and newpage are still under lock_page(). @@ -3940,7 +3957,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4055,6 +4073,7 @@ static int mem_cgroup_count_precharge_pt pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4201,6 +4220,7 @@ static int mem_cgroup_move_charge_pte_ra spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>