On Wed, Nov 14, 2012 at 03:08:45PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index fa06804..fe329da 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -516,6 +516,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> >  }
> >  #endif
> >  
> > +#ifndef my_zero_pfn
> > +static inline unsigned long my_zero_pfn(unsigned long addr)
> > +{
> > +	extern unsigned long zero_pfn;
> 
> I don't think you should be declaring this inside an inlined function;
> you probably should be protecting the declarations of the variable and
> the function instead.  Perhaps by CONFIG_MMU?

mips and s390 use a declaration inside an inline function to implement
is_zero_pfn(). I wanted to be consistent with that.

I have a patch to clean up the zero page helpers a bit. It's on top of
this patchset:

http://article.gmane.org/gmane.linux.kernel.mm/87387

> > +	return zero_pfn;
> > +}
> > +#endif
> > +
> >  /*
> >   * Multiple processes may "see" the same page.  E.g. for untouched
> >   * mappings of /dev/null, all processes see the same page full of
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 0d903bf..d767a7c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -824,6 +824,88 @@ out:
> >  	return ret;
> >  }
> >  
> > +/* no "address" argument so destroys page coloring of some arch */
> > +pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
> > +{
> 
> Umm, this is a copy and paste of pgtable_trans_huge_withdraw() from the
> generic page table handling.  Why can't you reuse that and support
> (and/or modify) the s390 and sparc code?

My bad. It's a mistake from conflict resolution. I'll fix that.

> > +	pgtable_t pgtable;
> > +
> > +	assert_spin_locked(&mm->page_table_lock);
> > +
> > +	/* FIFO */
> > +	pgtable = mm->pmd_huge_pte;
> > +	if (list_empty(&pgtable->lru))
> > +		mm->pmd_huge_pte = NULL;
> > +	else {
> > +		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
> > +					      struct page, lru);
> > +		list_del(&pgtable->lru);
> > +	}
> > +	return pgtable;
> > +}
> > +
> > +static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
> > +		struct vm_area_struct *vma, unsigned long address,
> > +		pmd_t *pmd, unsigned long haddr)
> 
> This whole function is extremely similar to the implementation of
> do_huge_pmd_wp_page_fallback(); is there really no way to fold the two?

It's similar in structure (I used do_huge_pmd_wp_page_fallback() as a
template), but the details differ in many places and I don't see how to
combine them without making the result ugly.

> Typically in cases like this it's helpful to split out different
> logical segments of a function into smaller functions that would handle
> both page and !page accordingly.
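One possible shape for such a split, for what it's worth: a small helper
that builds the pte for one subpage and covers both the page and !page
cases. This is only a rough sketch with an invented name, not code from
the patch:

static pte_t wp_fallback_mk_pte(struct page *page,
				struct vm_area_struct *vma,
				unsigned long haddr)
{
	pte_t entry;

	if (page) {
		/* map the freshly allocated small page, writable */
		entry = mk_pte(page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr);
	} else {
		/* map the 4k zero page, read-only and special */
		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
	}
	return entry;
}

The two fallback loops would then differ only in how they pick "page"
for each iteration: NULL everywhere except the faulting subpage in the
zero-page case, and the copied page in the regular case.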
> > +{
> > +	pgtable_t pgtable;
> > +	pmd_t _pmd;
> > +	struct page *page;
> > +	int i, ret = 0;
> > +	unsigned long mmun_start;	/* For mmu_notifiers */
> > +	unsigned long mmun_end;		/* For mmu_notifiers */
> > +
> > +	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
> > +	if (!page) {
> > +		ret |= VM_FAULT_OOM;
> > +		goto out;
> > +	}
> > +
> > +	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
> > +		put_page(page);
> > +		ret |= VM_FAULT_OOM;
> > +		goto out;
> > +	}
> > +
> > +	clear_user_highpage(page, address);
> > +	__SetPageUptodate(page);
> > +
> > +	mmun_start = haddr;
> > +	mmun_end   = haddr + HPAGE_PMD_SIZE;
> > +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> > +
> > +	spin_lock(&mm->page_table_lock);
> > +	pmdp_clear_flush(vma, haddr, pmd);
> > +	/* leave pmd empty until pte is filled */
> > +
> > +	pgtable = get_pmd_huge_pte(mm);
> > +	pmd_populate(mm, &_pmd, pgtable);
> > +
> > +	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> > +		pte_t *pte, entry;
> > +		if (haddr == (address & PAGE_MASK)) {
> > +			entry = mk_pte(page, vma->vm_page_prot);
> > +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> > +			page_add_new_anon_rmap(page, vma, haddr);
> > +		} else {
> > +			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
> > +			entry = pte_mkspecial(entry);
> > +		}
> > +		pte = pte_offset_map(&_pmd, haddr);
> > +		VM_BUG_ON(!pte_none(*pte));
> > +		set_pte_at(mm, haddr, pte, entry);
> > +		pte_unmap(pte);
> > +	}
> > +	smp_wmb(); /* make pte visible before pmd */
> > +	pmd_populate(mm, pmd, pgtable);
> > +	spin_unlock(&mm->page_table_lock);
> > +
> > +	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> > +
> > +	ret |= VM_FAULT_WRITE;
> > +out:
> > +	return ret;
> > +}
> > +
> >  static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
> >  					struct vm_area_struct *vma,
> >  					unsigned long address,
> > @@ -930,19 +1012,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  		unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
> >  {
> >  	int ret = 0;
> > -	struct page *page, *new_page;
> > +	struct page *page = NULL, *new_page;
> >  	unsigned long haddr;
> >  	unsigned long mmun_start;	/* For mmu_notifiers */
> >  	unsigned long mmun_end;		/* For mmu_notifiers */
> >  
> >  	VM_BUG_ON(!vma->anon_vma);
> > +	haddr = address & HPAGE_PMD_MASK;
> > +	if (is_huge_zero_pmd(orig_pmd))
> > +		goto alloc;
> >  	spin_lock(&mm->page_table_lock);
> >  	if (unlikely(!pmd_same(*pmd, orig_pmd)))
> >  		goto out_unlock;
> >  
> >  	page = pmd_page(orig_pmd);
> >  	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
> > -	haddr = address & HPAGE_PMD_MASK;
> >  	if (page_mapcount(page) == 1) {
> >  		pmd_t entry;
> >  		entry = pmd_mkyoung(orig_pmd);
> > @@ -954,7 +1038,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  	}
> >  	get_page(page);
> >  	spin_unlock(&mm->page_table_lock);
> > -
> > +alloc:
> 
> This could all use a minor restructuring to make it much cleaner,
> perhaps by extracting the page_mapcount(page) == 1 case to be a
> separate function that deals with non-copying writes?

Makes sense. I'll do it as a separate patch on top of the series.
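Roughly something like this, as a sketch only (the helper name and
return convention are invented; the body follows the visible
page_mapcount(page) == 1 branch of do_huge_pmd_wp_page() above):

/*
 * Try to satisfy the write fault in place, without copying.
 * Returns true on success.  Caller holds mm->page_table_lock and
 * has already checked pmd_same(*pmd, orig_pmd).
 */
static bool huge_pmd_reuse_write(struct vm_area_struct *vma,
				 unsigned long address, pmd_t *pmd,
				 pmd_t orig_pmd, unsigned long haddr)
{
	pmd_t entry;

	if (page_mapcount(pmd_page(orig_pmd)) != 1)
		return false;
	entry = pmd_mkyoung(orig_pmd);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
		update_mmu_cache_pmd(vma, address, pmd);
	return true;
}

do_huge_pmd_wp_page() would call it first and fall through to the
copy/allocation path when it returns false.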
> >  	if (transparent_hugepage_enabled(vma) &&
> >  	    !transparent_hugepage_debug_cow())
> >  		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
> > @@ -964,24 +1048,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  
> >  	if (unlikely(!new_page)) {
> >  		count_vm_event(THP_FAULT_FALLBACK);
> > -		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
> > -						   pmd, orig_pmd, page, haddr);
> > -		if (ret & VM_FAULT_OOM)
> > -			split_huge_page(page);
> > -		put_page(page);
> > +		if (is_huge_zero_pmd(orig_pmd)) {
> > +			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
> > +					address, pmd, haddr);
> > +		} else {
> > +			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
> > +					pmd, orig_pmd, page, haddr);
> > +			if (ret & VM_FAULT_OOM)
> > +				split_huge_page(page);
> > +			put_page(page);
> > +		}
> >  		goto out;
> >  	}
> >  	count_vm_event(THP_FAULT_ALLOC);
> >  
> >  	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
> >  		put_page(new_page);
> > -		split_huge_page(page);
> > -		put_page(page);
> > +		if (page) {
> > +			split_huge_page(page);
> > +			put_page(page);
> > +		}
> >  		ret |= VM_FAULT_OOM;
> >  		goto out;
> >  	}
> >  
> > -	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
> > +	if (is_huge_zero_pmd(orig_pmd))
> > +		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
> > +	else
> > +		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
> >  	__SetPageUptodate(new_page);
> >  
> >  	mmun_start = haddr;
> > @@ -989,7 +1083,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> >  
> >  	spin_lock(&mm->page_table_lock);
> > -	put_page(page);
> > +	if (page)
> > +		put_page(page);
> >  	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
> >  		spin_unlock(&mm->page_table_lock);
> >  		mem_cgroup_uncharge_page(new_page);
> > @@ -997,7 +1092,6 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  		goto out_mn;
> >  	} else {
> >  		pmd_t entry;
> > -		VM_BUG_ON(!PageHead(page));
> >  		entry = mk_pmd(new_page, vma->vm_page_prot);
> >  		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> >  		entry = pmd_mkhuge(entry);
> > @@ -1005,8 +1099,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  		page_add_new_anon_rmap(new_page, vma, haddr);
> >  		set_pmd_at(mm, haddr, pmd, entry);
> >  		update_mmu_cache_pmd(vma, address, pmd);
> > -		page_remove_rmap(page);
> > -		put_page(page);
> > +		if (is_huge_zero_pmd(orig_pmd))
> > +			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > +		if (page) {
> 
> Couldn't this be an "else" instead?

Yes. I'll update.

> > +			VM_BUG_ON(!PageHead(page));
> > +			page_remove_rmap(page);
> > +			put_page(page);
> > +		}
> >  		ret |= VM_FAULT_WRITE;
> >  	}
> >  	spin_unlock(&mm->page_table_lock);

-- 
 Kirill A. Shutemov