On 17/08/2017 02:05, Jérôme Glisse wrote: > HMM pages (private or public device pages) are ZONE_DEVICE page and > thus need special handling when it comes to lru or refcount. This > patch make sure that memcontrol properly handle those when it face > them. Those pages are use like regular pages in a process address > space either as anonymous page or as file back page. So from memcg > point of view we want to handle them like regular page for now at > least. > > Changed since v3: > - remove public support and move those chunk to separate patch > Changed since v2: > - s/host/public > Changed since v1: > - s/public/host > - add comments explaining how device memory behave and why > > Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx> > Acked-by: Balbir Singh <bsingharora@xxxxxxxxx> > Cc: Johannes Weiner <hannes@xxxxxxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxxxxx> > Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx> > Cc: cgroups@xxxxxxxxxxxxxxx > --- > kernel/memremap.c | 1 + > mm/memcontrol.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 49 insertions(+), 4 deletions(-) > > diff --git a/kernel/memremap.c b/kernel/memremap.c > index 398630c1fba3..f42d7483e886 100644 > --- a/kernel/memremap.c > +++ b/kernel/memremap.c > @@ -492,6 +492,7 @@ void put_zone_device_private_page(struct page *page) > __ClearPageWaiters(page); > > page->mapping = NULL; > + mem_cgroup_uncharge(page); > > page->pgmap->page_free(page, page->pgmap->data); > } else if (!count) > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 604fb3ca8028..977d1cf3493a 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -4407,12 +4407,13 @@ enum mc_target_type { > MC_TARGET_NONE = 0, > MC_TARGET_PAGE, > MC_TARGET_SWAP, > + MC_TARGET_DEVICE, > }; > > static struct page *mc_handle_present_pte(struct vm_area_struct *vma, > unsigned long addr, pte_t ptent) > { > - struct page *page = vm_normal_page(vma, addr, ptent); > + struct page *page = _vm_normal_page(vma, addr, ptent, 
true); Hi Jérôme, As _vm_normal_page() is defined later, in patch 18, this patch will break bisectability. Cheers, Laurent. > > if (!page || !page_mapped(page)) > return NULL; > @@ -4429,7 +4430,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, > return page; > } > > -#ifdef CONFIG_SWAP > +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) > static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, > pte_t ptent, swp_entry_t *entry) > { > @@ -4438,6 +4439,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, > > if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) > return NULL; > + > + /* > + * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to > + * a device and because they are not accessible by CPU they are store > + * as special swap entry in the CPU page table. > + */ > + if (is_device_private_entry(ent)) { > + page = device_private_entry_to_page(ent); > + /* > + * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have > + * a refcount of 1 when free (unlike normal page) > + */ > + if (!page_ref_add_unless(page, 1, 1)) > + return NULL; > + return page; > + } > + > /* > * Because lookup_swap_cache() updates some statistics counter, > * we call find_get_page() with swapper_space directly. > @@ -4598,6 +4616,12 @@ static int mem_cgroup_move_account(struct page *page, > * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a > * target for charge migration. if @target is not NULL, the entry is stored > * in target->ent. > + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE > + * (so ZONE_DEVICE page and thus not on the lru). For now we such page is > + * charge like a regular page would be as for all intent and purposes it is > + * just special memory taking the place of a regular page. > + * > + * See Documentations/vm/hmm.txt and include/linux/hmm.h > * > * Called with pte lock held. 
> */ > @@ -4626,6 +4650,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, > */ > if (page->mem_cgroup == mc.from) { > ret = MC_TARGET_PAGE; > + if (is_device_private_page(page)) > + ret = MC_TARGET_DEVICE; > if (target) > target->page = page; > } > @@ -4693,6 +4719,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, > > ptl = pmd_trans_huge_lock(pmd, vma); > if (ptl) { > + /* > + * Note their can not be MC_TARGET_DEVICE for now as we do not > + * support transparent huge page with MEMORY_DEVICE_PUBLIC or > + * MEMORY_DEVICE_PRIVATE but this might change. > + */ > if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) > mc.precharge += HPAGE_PMD_NR; > spin_unlock(ptl); > @@ -4908,6 +4939,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, > putback_lru_page(page); > } > put_page(page); > + } else if (target_type == MC_TARGET_DEVICE) { > + page = target.page; > + if (!mem_cgroup_move_account(page, true, > + mc.from, mc.to)) { > + mc.precharge -= HPAGE_PMD_NR; > + mc.moved_charge += HPAGE_PMD_NR; > + } > + put_page(page); > } > spin_unlock(ptl); > return 0; > @@ -4919,12 +4958,16 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > for (; addr != end; addr += PAGE_SIZE) { > pte_t ptent = *(pte++); > + bool device = false; > swp_entry_t ent; > > if (!mc.precharge) > break; > > switch (get_mctgt_type(vma, addr, ptent, &target)) { > + case MC_TARGET_DEVICE: > + device = true; > + /* fall through */ > case MC_TARGET_PAGE: > page = target.page; > /* > @@ -4935,7 +4978,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, > */ > if (PageTransCompound(page)) > goto put; > - if (isolate_lru_page(page)) > + if (!device && isolate_lru_page(page)) > goto put; > if (!mem_cgroup_move_account(page, false, > mc.from, mc.to)) { > @@ -4943,7 +4986,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, > /* we uncharge from mc.from later. 
*/ > mc.moved_charge++; > } > - putback_lru_page(page); > + if (!device) > + putback_lru_page(page); > put: /* get_mctgt_type() gets the page */ > put_page(page); > break; > -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html