On 05/01/2018 06:19 PM, TSUKADA Koutaro wrote:
> If nr_overcommit_hugepages is assumed to be infinite, allocating pages for
> hugetlb's overcommit from the buddy pool is effectively unlimited, even
> when the task is limited by memcg. The purpose of this patch is that if we
> allocate a hugetlb page from the buddy pool, we should charge it to memcg.
>
> A straightforward way for user applications to use hugetlb pages is to
> create the pool (nr_hugepages), but root privileges are required. In the
> HPC field, for example, giving root privileges to general users is
> difficult. Also, creating the pool requires estimating exactly how much
> hugetlb will be used, which feels a bit troublesome.
>
> In such a case, using hugetlb's overcommit feature, we considered letting
> the user use hugetlb pages only through overcommit, without creating any
> pool. However, as mentioned earlier, in the current implementation pages
> can be allocated without limit through overcommit. Therefore, by
> introducing memcg charging, I wanted to be able to manage the memory
> resources used by the user application solely with memcg's limits.
>
> This patch targets RHELSA(kernel-alt-4.11.0-45.6.1.el7a.src.rpm).

It would be very helpful to rebase this patch on a recent mainline kernel.
The code to allocate surplus huge pages has been significantly changed in
recent kernels. I have no doubt that this is a real issue and we are not
correctly charging surplus to a memcg. But your patch will be hard to
evaluate when based on an older distro kernel.

> The patch does the following things.
>
> When allocating a page from buddy at __hugetlb_alloc_buddy_huge_page, set
> the HUGETLB_OVERCOMMIT flag in that page[1].private. When a page with
> HUGETLB_OVERCOMMIT set is actually used (at hugepage_add_new_anon_rmap or
> huge_add_to_page_cache), try to charge it to memcg. If the charge is
> successful, also set the HUGETLB_OVERCOMMIT_CHARGED flag in that page[1].

What is the reason for not charging pages at allocation/reserve time? I am
not an expert in memcg accounting, but I would think the pages should be
charged at allocation time. Otherwise, a task could allocate a large number
of (reserved) pages that are not charged to a memcg. memcg charges in other
code paths seem to happen at huge page allocation time.
--
Mike Kravetz

>
> The page charged to memcg will finally be uncharged at free_huge_page.
>
> Modification of memcontrol.c is for updating the statistical information
> when the task moves between cgroups. hpage_nr_pages works correctly for
> THP, but it is not suitable for hugetlb pages that use, for example, the
> contiguous bit of aarch64, so it needs to be modified.
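For context on that last point: hpage_nr_pages() only knows about PMD-sized
transparent huge pages, so for a hugetlb page whose size is not
HPAGE_PMD_SIZE (for example an arm64 contiguous-bit huge page, or a 1GB
page) it returns the wrong count, whereas compound_order() reflects the
page's real size. Roughly (a simplified illustration, not code from this
patch):

  /* Simplified sketch, assuming CONFIG_TRANSPARENT_HUGEPAGE=y. */
  static inline int hpage_nr_pages(struct page *page)
  {
          if (unlikely(PageTransHuge(page)))
                  return HPAGE_PMD_NR;    /* always the PMD size, e.g. 512 */
          return 1;
  }

  /*
   * A hugetlb head page also satisfies PageTransHuge(), so this returns
   * HPAGE_PMD_NR regardless of the actual huge page size.  By contrast,
   * 1 << compound_order(page) gives the real number of base pages (16 for
   * a 64kB arm64 contiguous huge page, 262144 for a 1GB page with 4kB base
   * pages), which is why the memcontrol.c hunks below switch to it.
   */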
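On the allocation/reserve-time versus first-use question above: with
overcommit, the surplus pages backing a reservation are typically taken
from the buddy allocator already at mmap() time, well before any fault. A
hypothetical userspace illustration of the window this leaves (names and
sizes are only examples; assumes 2MB huge pages and that the admin has
raised /proc/sys/vm/nr_overcommit_hugepages to at least 512):

  /* Hypothetical example, not part of the patch. */
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
          size_t len = 512UL << 21;       /* 512 x 2MB = 1GB */
          char *p;

          /*
           * The surplus huge pages backing this reservation are allocated
           * from buddy here, at mmap() time...
           */
          p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
          if (p == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }

          /*
           * ...but with charging done at rmap/page-cache insertion time,
           * nothing shows up in the memcg until the pages are touched here.
           */
          memset(p, 0, len);

          munmap(p, len);
          return 0;
  }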
>
> Signed-off-by: TSUKADA Koutaro <tsukada@xxxxxxxxxxxx>
> ---
>  include/linux/hugetlb.h | 45 ++++++++++++++++++++++
>  mm/hugetlb.c            | 80 +++++++++++++++++++++++++++++++++++++++
>  mm/memcontrol.c         | 98 ++++++++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 219 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index d67675e..67991cb 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -511,6 +511,51 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
>          set_huge_pte_at(mm, addr, ptep, pte);
>  }
>  #endif
> +
> +#define HUGETLB_OVERCOMMIT 1UL
> +#define HUGETLB_OVERCOMMIT_CHARGED 2UL
> +
> +static inline void add_hugetlb_compound_private(struct page *page,
> +                                                unsigned long val)
> +{
> +        page[1].private |= val;
> +}
> +
> +static inline unsigned long get_hugetlb_compound_private(struct page *page)
> +{
> +        return page_private(&page[1]);
> +}
> +
> +static inline void add_hugetlb_overcommit(struct page *page)
> +{
> +        add_hugetlb_compound_private(page, HUGETLB_OVERCOMMIT);
> +}
> +
> +static inline void del_hugetlb_overcommit(struct page *page)
> +{
> +        add_hugetlb_compound_private(page, ~(HUGETLB_OVERCOMMIT));
> +}
> +
> +static inline int is_hugetlb_overcommit(struct page *page)
> +{
> +        return (get_hugetlb_compound_private(page) & HUGETLB_OVERCOMMIT);
> +}
> +
> +static inline void add_hugetlb_overcommit_charged(struct page *page)
> +{
> +        add_hugetlb_compound_private(page, HUGETLB_OVERCOMMIT_CHARGED);
> +}
> +
> +static inline void del_hugetlb_overcommit_charged(struct page *page)
> +{
> +        add_hugetlb_compound_private(page, ~(HUGETLB_OVERCOMMIT_CHARGED));
> +}
> +
> +static inline int is_hugetlb_overcommit_charged(struct page *page)
> +{
> +        return (get_hugetlb_compound_private(page) &
> +                HUGETLB_OVERCOMMIT_CHARGED);
> +}
>  #else  /* CONFIG_HUGETLB_PAGE */
>  struct hstate {};
>  #define alloc_huge_page(v, a, r) NULL
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6191fb9..2cd91d9 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -24,6 +24,7 @@
>  #include <linux/swapops.h>
>  #include <linux/page-isolation.h>
>  #include <linux/jhash.h>
> +#include <linux/memcontrol.h>
>
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
> @@ -1227,6 +1228,17 @@ static void clear_page_huge_active(struct page *page)
>          ClearPagePrivate(&page[1]);
>  }
>
> +static void hugetlb_overcommit_finalize(struct page *page)
> +{
> +        if (is_hugetlb_overcommit_charged(page)) {
> +                del_hugetlb_overcommit_charged(page);
> +                mem_cgroup_uncharge(page);
> +        }
> +        if (is_hugetlb_overcommit(page)) {
> +                del_hugetlb_overcommit(page);
> +        }
> +}
> +
>  void free_huge_page(struct page *page)
>  {
>          /*
> @@ -1239,6 +1251,8 @@ void free_huge_page(struct page *page)
>                  (struct hugepage_subpool *)page_private(page);
>          bool restore_reserve;
>
> +        hugetlb_overcommit_finalize(page);
> +
>          set_page_private(page, 0);
>          page->mapping = NULL;
>          VM_BUG_ON_PAGE(page_count(page), page);
> @@ -1620,6 +1634,13 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
>          spin_unlock(&hugetlb_lock);
>
>          page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
> +        if (page) {
> +                /*
> +                 * At this point it is impossible to judge whether it is
> +                 * mapped or just reserved, so only mark it.
> +                 */
> +                add_hugetlb_overcommit(page);
> +        }
>
>          spin_lock(&hugetlb_lock);
>          if (page) {
> @@ -3486,6 +3507,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
>          int ret = 0, outside_reserve = 0;
>          unsigned long mmun_start;       /* For mmu_notifiers */
>          unsigned long mmun_end;         /* For mmu_notifiers */
> +        struct mem_cgroup *memcg;
> +        int memcg_charged = 0;
>
>          pte = huge_ptep_get(ptep);
>          old_page = pte_page(pte);
> @@ -3552,6 +3575,15 @@ retry_avoidcopy:
>                  goto out_release_old;
>          }
>
> +        if (unlikely(is_hugetlb_overcommit(new_page))) {
> +                if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL,
> +                                          &memcg, true)) {
> +                        ret = VM_FAULT_OOM;
> +                        goto out_release_all;
> +                }
> +                memcg_charged = 1;
> +        }
> +
>          /*
>           * When the original hugepage is shared one, it does not have
>           * anon_vma prepared.
> @@ -3587,12 +3619,18 @@ retry_avoidcopy:
>                                  make_huge_pte(vma, new_page, 1));
>                  page_remove_rmap(old_page, true);
>                  hugepage_add_new_anon_rmap(new_page, vma, address);
> +                if (memcg_charged) {
> +                        mem_cgroup_commit_charge(new_page, memcg, false, true);
> +                        add_hugetlb_overcommit_charged(new_page);
> +                }
>                  /* Make the old page be freed below */
>                  new_page = old_page;
>          }
>          spin_unlock(ptl);
>          mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
>  out_release_all:
> +        if (memcg_charged)
> +                mem_cgroup_cancel_charge(new_page, memcg, true);
>          restore_reserve_on_error(h, vma, address, new_page);
>          put_page(new_page);
>  out_release_old:
> @@ -3641,9 +3679,18 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>          struct inode *inode = mapping->host;
>          struct hstate *h = hstate_inode(inode);
>          int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
> +        struct mem_cgroup *memcg;
>
>          if (err)
>                  return err;
> +        if (page && is_hugetlb_overcommit(page)) {
> +                err = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL,
> +                                            &memcg, true);
> +                if (err)
> +                        return err;
> +                mem_cgroup_commit_charge(page, memcg, false, true);
> +                add_hugetlb_overcommit_charged(page);
> +        }
>          ClearPagePrivate(page);
>
>          spin_lock(&inode->i_lock);
> @@ -3663,6 +3710,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
>          struct page *page;
>          pte_t new_pte;
>          spinlock_t *ptl;
> +        struct mem_cgroup *memcg;
> +        int memcg_charged = 0;
>
>          /*
>           * Currently, we are forced to kill the process in the event the
> @@ -3740,6 +3789,14 @@ retry:
>                  }
>          } else {
>                  lock_page(page);
> +                if (unlikely(is_hugetlb_overcommit(page))) {
> +                        if (mem_cgroup_try_charge(page, mm, GFP_KERNEL,
> +                                                  &memcg, true)) {
> +                                ret = VM_FAULT_OOM;
> +                                goto backout_unlocked;
> +                        }
> +                        memcg_charged = 1;
> +                }
>                  if (unlikely(anon_vma_prepare(vma))) {
>                          ret = VM_FAULT_OOM;
>                          goto backout_unlocked;
> @@ -3786,6 +3843,10 @@ retry:
>          if (anon_rmap) {
>                  ClearPagePrivate(page);
>                  hugepage_add_new_anon_rmap(page, vma, address);
> +                if (memcg_charged) {
> +                        mem_cgroup_commit_charge(page, memcg, false, true);
> +                        add_hugetlb_overcommit_charged(page);
> +                }
>          } else
>                  page_dup_rmap(page, true);
>          new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
> @@ -3806,6 +3867,8 @@ out:
>  backout:
>          spin_unlock(ptl);
>  backout_unlocked:
> +        if (memcg_charged)
> +                mem_cgroup_cancel_charge(page, memcg, true);
>          unlock_page(page);
>          restore_reserve_on_error(h, vma, address, page);
>          put_page(page);
> @@ -4002,6 +4065,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>          spinlock_t *ptl;
>          int ret;
>          struct page *page;
> +        struct mem_cgroup *memcg;
> +        int memcg_charged = 0;
> +
>
>          if (!*pagep) {
>                  ret = -ENOMEM;
> @@ -4045,6 +4111,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                  goto out_release_nounlock;
>          }
>
> +        if (!vm_shared && is_hugetlb_overcommit(page)) {
> +                ret = -ENOMEM;
> +                if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL,
> +                                          &memcg, true))
> +                        goto out_release_nounlock;
> +                memcg_charged = 1;
> +        }
> +
>          ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
>          spin_lock(ptl);
>
> @@ -4057,6 +4131,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>          } else {
>                  ClearPagePrivate(page);
>                  hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
> +                if (memcg_charged) {
> +                        mem_cgroup_commit_charge(page, memcg, false, true);
> +                        add_hugetlb_overcommit_charged(page);
> +                }
>          }
>
>          _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
> @@ -4082,6 +4160,8 @@ out:
>  out_release_unlock:
>          spin_unlock(ptl);
>  out_release_nounlock:
> +        if (memcg_charged)
> +                mem_cgroup_cancel_charge(page, memcg, true);
>          if (vm_shared)
>                  unlock_page(page);
>          put_page(page);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 02cfcd9..1842693 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4531,7 +4531,7 @@ static int mem_cgroup_move_account(struct page *page,
>                                     struct mem_cgroup *to)
>  {
>          unsigned long flags;
> -        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> +        unsigned int nr_pages = compound ? 1 << compound_order(page) : 1;
>          int ret;
>          bool anon;
>
> @@ -4744,12 +4744,64 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
>          return 0;
>  }
>
> +#ifdef CONFIG_HUGETLB_PAGE
> +static enum mc_target_type get_mctgt_type_hugetlb(struct vm_area_struct *vma,
> +                unsigned long addr, pte_t *pte, union mc_target *target)
> +{
> +        struct page *page = NULL;
> +        pte_t entry;
> +        enum mc_target_type ret = MC_TARGET_NONE;
> +
> +        if (!(mc.flags & MOVE_ANON))
> +                return ret;
> +
> +        entry = huge_ptep_get(pte);
> +        if (!pte_present(entry))
> +                return ret;
> +        page = pte_page(entry);
> +        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
> +        if (!is_hugetlb_overcommit_charged(page))
> +                return ret;
> +        if (page->mem_cgroup == mc.from) {
> +                ret = MC_TARGET_PAGE;
> +                if (target) {
> +                        get_page(page);
> +                        target->page = page;
> +                }
> +        }
> +
> +        return ret;
> +}
> +
> +static int hugetlb_count_precharge_pte_range(pte_t *pte, unsigned long hmask,
> +                                        unsigned long addr, unsigned long end,
> +                                        struct mm_walk *walk)
> +{
> +        struct vm_area_struct *vma = walk->vma;
> +        struct mm_struct *mm = walk->mm;
> +        spinlock_t *ptl;
> +        union mc_target target;
> +
> +        ptl = huge_pte_lock(hstate_vma(vma), mm, pte);
> +        if (get_mctgt_type_hugetlb(vma, addr, pte, &target) == MC_TARGET_PAGE) {
> +                mc.precharge += (1 << compound_order(target.page));
> +                put_page(target.page);
> +        }
> +        spin_unlock(ptl);
> +
> +        return 0;
> +}
> +#endif
> +
>  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
>  {
>          unsigned long precharge;
>
>          struct mm_walk mem_cgroup_count_precharge_walk = {
>                  .pmd_entry = mem_cgroup_count_precharge_pte_range,
> +#ifdef CONFIG_HUGETLB_PAGE
> +                .hugetlb_entry = hugetlb_count_precharge_pte_range,
> +#endif
>                  .mm = mm,
>          };
>          down_read(&mm->mmap_sem);
> @@ -5023,10 +5075,48 @@ put:                    /* get_mctgt_type() gets the page */
>          return ret;
>  }
>
> +#ifdef CONFIG_HUGETLB_PAGE
> +static int hugetlb_move_charge_pte_range(pte_t *pte, unsigned long hmask,
> +                                        unsigned long addr, unsigned long end,
> +                                        struct mm_walk *walk)
> +{
> +        struct vm_area_struct *vma = walk->vma;
> +        struct mm_struct *mm = walk->mm;
> +        spinlock_t *ptl;
> +        enum mc_target_type target_type;
> +        union mc_target target;
> +        struct page *page;
> +        unsigned long nr_pages;
> +
> +        ptl = huge_pte_lock(hstate_vma(vma), mm, pte);
> +        target_type = get_mctgt_type_hugetlb(vma, addr, pte, &target);
> +        if (target_type == MC_TARGET_PAGE) {
> +                page = target.page;
> +                nr_pages = (1 << compound_order(page));
> +                if (mc.precharge < nr_pages) {
> +                        put_page(page);
> +                        goto unlock;
> +                }
> +                if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) {
> +                        mc.precharge -= nr_pages;
> +                        mc.moved_charge += nr_pages;
> +                }
> +                put_page(page);
> +        }
> +unlock:
> +        spin_unlock(ptl);
> +
> +        return 0;
> +}
> +#endif
> +
>  static void mem_cgroup_move_charge(void)
>  {
>          struct mm_walk mem_cgroup_move_charge_walk = {
>                  .pmd_entry = mem_cgroup_move_charge_pte_range,
> +#ifdef CONFIG_HUGETLB_PAGE
> +                .hugetlb_entry = hugetlb_move_charge_pte_range,
> +#endif
>                  .mm = mc.mm,
>          };
>
> @@ -5427,7 +5517,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>                            bool compound)
>  {
>          struct mem_cgroup *memcg = NULL;
> -        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> +        unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
>          int ret = 0;
>
>          if (mem_cgroup_disabled())
> @@ -5488,7 +5578,7 @@ out:
>  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
>                                bool lrucare, bool compound)
>  {
> -        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> +        unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
>
>          VM_BUG_ON_PAGE(!page->mapping, page);
>          VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
> @@ -5532,7 +5622,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
>  void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
>                                bool compound)
>  {
> -        unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> +        unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
>
>          if (mem_cgroup_disabled())
>                  return;
>