Add some basic VM routines and macros to operate on the huge page cache, designed to allow proper faulting of huge pages. 1. __do_fault - made common for huge and small pages. 2. Simple rmap wrappers for huge pages. 3. Other changes.
Signed-off-by: Radosław Smogura <mail@xxxxxxxxxx>
--- include/linux/defrag-pagecache.h | 18 +-- include/linux/fs.h | 19 +- include/linux/mm.h | 28 ++ include/linux/mm_types.h | 2 +- include/linux/rmap.h | 9 + mm/huge_memory.c | 42 +++ mm/memory.c | 528 +++++++++++++++++++++++++++++++------- mm/page-writeback.c | 31 +++ mm/rmap.c | 29 ++ 9 files changed, 582 insertions(+), 124 deletions(-)
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h index 46793de..4ca3468 100644 --- a/include/linux/defrag-pagecache.h +++ b/include/linux/defrag-pagecache.h @@ -8,7 +8,7 @@ #ifndef DEFRAG_PAGECACHE_H #define DEFRAG_PAGECACHE_H -#include <linux/fs.h> +#include <linux/defrag-pagecache.h> /* XXX Split this file into two public and protected - comments below * Protected will contain @@ -24,22 +24,6 @@ typedef struct page *defrag_generic_get_page( const struct defrag_pagecache_ctl *ctl, struct inode *inode, pgoff_t pageIndex); -/** Passes additional information and controls to page defragmentation. */ -struct defrag_pagecache_ctl { - /** If yes defragmentation will try to fill page caches. */ - char fillPages:1; - - /** If filling of page fails, defragmentation will fail too. Setting - * this requires {@link #fillPages} will be setted. - */ - char requireFillPages:1; - - /** If yes defragmentation will try to force in many aspects, this may - * cause, operation to run longer, but with greater probability of - * success. */ - char force:1; -}; - /** Defragments page cache of specified file and migrates it's to huge pages. * * @param f
diff --git a/include/linux/fs.h b/include/linux/fs.h index bfd9122..7288166 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,10 +10,7 @@ #include <linux/ioctl.h> #include <linux/blk_types.h> #include <linux/types.h> - -#ifdef CONFIG_HUGEPAGECACHE -#include <linux/defrag-pagecache.h> -#endif +#include <linux/defrag-pagecache-base.h> /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -596,6 +593,9 @@ struct address_space_operations { /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); + /** Same as \a set_page_dirty but for a huge page */ + int (*set_page_dirty_huge)(struct page *page); + int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); @@ -606,7 +606,6 @@ struct address_space_operations { loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -#ifdef CONFIG_HUGEPAGECACHE /** Used to defrag (migrate) pages at position {@code pos} * to huge pages. Having this not {@code NULL} will indicate that * address space, generally, supports huge pages (transaprent * * @param pagep on success will be setted to established huge page * - * @returns TODO What to return? - * {@code 0} on success, value less then {@code 0} on error + * @returns {@code 0} on success, value less than {@code 0} on error */ int (*defragpage) (struct file *, struct address_space *mapping, loff_t pos, struct page **pagep, const struct defrag_pagecache_ctl *ctl); -#endif + /** Used to split a page; this method may be called under memory + * pressure. Actually, you should not split the page.
+ */ + int (*split_page) (struct file *file, struct address_space *mapping, + loff_t pos, struct page *huge_page); + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long);
diff --git a/include/linux/mm.h b/include/linux/mm.h index 72f6a50..27a10c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,10 +206,19 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + /** Same as \a fault but should return a huge page instead of a single one. + * If this function fails, the caller may try again with fault. + */ + int (*fault_huge)(struct vm_area_struct *vma, struct vm_fault *vmf); + /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + /** Same as \a page_mkwrite, but for a huge page. */ + int (*page_mkwrite_huge)(struct vm_area_struct *vma, + struct vm_fault *vmf); + /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware */ @@ -534,6 +543,16 @@ static inline void get_page(struct page *page) } } +/** Bumps the tail pages' usage count. If there is at least one page that does + * not have a valid mapping, the page count is left untouched. + */ +extern void get_page_tails_for_fmap(struct page *head); + +/** Decreases the tail pages' usage count. + * This function assumes you have got or frozen the compound page. + */ +extern void put_page_tails_for_fmap(struct page *head); + static inline void get_huge_page_tail(struct page *page) { /* @@ -996,6 +1015,7 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ +#define VM_FAULT_NOHUGE 0x0800 /* ->fault_huge, no huge page available */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ @@ -1161,6 +1181,14 @@ int redirty_page_for_writepage(struct writeback_control *wbc, void account_page_dirtied(struct page *page, struct address_space *mapping); void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); + +/** Sets a huge page dirty; this will lock all tails, the head should be locked. + * The compound page should be got or frozen. Skips all pages that have no + * mapping. + * + * @param head + * @return number of successful set_page_dirty calls + */ +int set_page_dirty_huge(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7649722..7d2c09d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -296,7 +296,7 @@ struct vm_area_struct { /* Function pointers to deal with this struct.
*/ const struct vm_operations_struct *vm_ops; - + /* Information about our backing store: */ unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units, *not* PAGE_CACHE_SIZE */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1cdd62a..bc547cb 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -142,8 +142,17 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); + +/** Adds rmap for a huge page; the compound page must be got or frozen. + */ +extern void page_add_file_rmap_huge(struct page *head); + void page_remove_rmap(struct page *); +/** Removes rmap for a huge page; the compound page must be got or frozen. + */ +void page_remove_rmap_huge(struct page *); + void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e3b4c38..74d2e84 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2455,3 +2455,45 @@ void __vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_page_address(next->vm_mm, nstart); } } + +/** Bumps the tail pages' usage count. This function assumes you have got or + * frozen the compound page. + */ +void get_page_tails_for_fmap(struct page *head) +{ + struct page *page; + + VM_BUG_ON(!PageHead(head)); + VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1); + VM_BUG_ON(compound_order(head) < 2); + + get_page(head + 1); + /* We may use __first_page, because we hold the compound page as a whole. */ + for (page = head + 2; page->__first_page == head; page++) { + VM_BUG_ON(!atomic_read(&page->_count)); + VM_BUG_ON(!page->mapping); + VM_BUG_ON(!PageTail(page)); + get_page(page); + } +} + +/** Decreases the tail pages' usage count. + * This function assumes you have got or frozen the compound page. + */ +void put_page_tails_for_fmap(struct page *head) +{ + struct page *page; + + VM_BUG_ON(!PageHead(head)); + VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1); + VM_BUG_ON(compound_order(head) < 2); + + put_page(head + 1); + /* We may use __first_page, because we hold the compound page as a whole. */ + for (page = head + 2; page->__first_page == head; page++) { + VM_BUG_ON(!atomic_read(&page->_count)); + VM_BUG_ON(!page->mapping); + VM_BUG_ON(!PageTail(page)); + put_page(page); + } +}
diff --git a/mm/memory.c b/mm/memory.c index a0ab73c..7427c9b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3148,7 +3148,137 @@ oom: return VM_FAULT_OOM; } -/* +/** Level 0 check whether it's possible to establish a huge pmd in the process + * address space. + */ +static int check_if_hugemapping_is_possible0( + struct vm_area_struct *vma, + unsigned long address, + pgoff_t pgoff, + pmd_t pmdVal /* Keep pmd for THP for Private Mapping. */) +{ + if (vma->vm_ops) { + /* This is the base check. */ + if (!vma->vm_ops->fault_huge) + return 0; + } else { + return 0; + } + + if (vma->vm_flags & VM_SHARED && !(vma->vm_flags & VM_NONLINEAR)) { + /* Check if VMA address is pmd aligned */ + if ((address & ~PMD_MASK) != 0) + return 0; + + /* Check if pgoff is huge page aligned */ + /* XXX This should be exported as it's reused in defrag. */ + if ((pgoff & ((1 << (PMD_SHIFT - PAGE_SHIFT)) - 1)) != 0) + return 0; + + /* Check if huge pmd will fit inside VMA. + * pmd_addr_end returns the first byte after the end, not the last byte!
*/ + if (!(pmd_addr_end(address, (unsigned long) -1) <= vma->vm_end)) + return 0; + + /* WIP [Private THP], check if the pmd is marked as "do not make THP", + * e.g. because it has COWs. (COWs give milk.) + * We need to add such a flag because + */ + + /* Check if the file has enough length - not needed if there is a + * huge page in the page cache, as this implies the file has enough length. + * TODO Think on the above. If true, make it a requirement for THP support + * in the page cache (put in documentation). + * This may break the concept that the page cache may have a not + * up to date huge page, too. + */ + } else { + /* Anonymous VMA - not implemented, yet. */ + return 0; + } + + /* All tests passed */ + printk(KERN_INFO "Chk - All passed"); + return 1; +} + + +/** Common function for performing faulting with support for huge pages. + * This method is designed to be wrapped (facaded) by others. + * + * TODO Still need to consider the locking order, to prevent deadlocks... + * it looks like compound_lock -> page_lock will be better + * + * @param head loaded head page, locked iff compound_lock is held, got + * + * @return {@code 0} on success + */ +static /*inline*/ int __huge_lock_check( + struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pud_t *pud, + pmd_t pmd, + pgoff_t pgoff, + unsigned int flags, + struct page *head) +{ + struct page *workPage; + unsigned long workAddress; + unsigned int processedPages; + + int result = 0; + + VM_BUG_ON(!check_if_hugemapping_is_possible0(vma, address, pgoff, + pmd)); + VM_BUG_ON(atomic_read(&head->_count) <= 2); + VM_BUG_ON(!PageHead(head)); + + /* TODO [Documentation] expose the rules below, from code. + * + * XXX Is it possible, with the tests in the loop, to map not-uptodate pages? + * + * It looks like that with the following design, removing the + * page uptodate flag for compound pages may require the compound lock + * or something else. + */ + + /* Check if the tail pages are uptodate; this should not fail, + * as we have the compound_lock, but I can't guarantee it, nor that they + * are linearly ordered. + */ + processedPages = 0; + workAddress = address; + /** XXX [Performance] compound_head is rather slow; make a new macro for when + * we have the compound page got. + */ + for (workPage = head; compound_head(workPage) == head; workPage++) { + if (!PageUptodate(workPage) + || !workPage->mapping + || (workPage->index - processedPages != pgoff)) { + result = -EINVAL; + goto exit_processing; + } + /* We don't check ptes, because we have a shared mapping, + * so all ptes should be (or could be in the future) the same, meaning + * mainly protection flags. This check will be required for + * private mappings. + */ + processedPages++; + workAddress += PAGE_SIZE; + } + if (processedPages != (1 << (PMD_SHIFT - PAGE_SHIFT))) { + /* Not enough processed pages, why? */ + return processedPages + 1; + } + +exit_processing: + printk("Processed %d", processedPages); + + return result; +} + +/** * __do_fault() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid @@ -3160,28 +3290,45 @@ oom: * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte neither mapped nor locked. * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * This method shares the same concepts for single and huge pages. + * + * @param pud pud entry; if NULL the method operates in single page mode, + * otherwise it operates in huge page mode.
*/ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static inline int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pud_t *pud, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, + pmd_t orig_pmd, pte_t orig_pte) { pte_t *page_table; + pmd_t *huge_table; + + pte_t entry; + pmd_t hentry; + spinlock_t *ptl; struct page *page; struct page *cow_page; - pte_t entry; + int anon = 0; struct page *dirty_page = NULL; struct vm_fault vmf; + const struct vm_operations_struct *vm_ops = vma->vm_ops; int ret; int page_mkwrite = 0; + VM_BUG_ON((!!pmd) == (!!pud)); + /* * If we do COW later, allocate page befor taking lock_page() * on the file cache page. This will reduce lock holding time. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - + if (pud) { + /* Privte mapping write not supported yet. */ + BUG(); + } if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -3196,14 +3343,20 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else cow_page = NULL; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.virtual_address = (void __user *) + (address & (pud ? HPAGE_MASK : PAGE_MASK)); vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; - ret = vma->vm_ops->fault(vma, &vmf); + /** XXX Tails should be getted to. */ + if (pud) + ret = vm_ops->fault_huge(vma, &vmf); + else + ret = vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) + VM_FAULT_RETRY | VM_FAULT_NOHUGE))) goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { @@ -3213,21 +3366,36 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } - /* - * For consistency in subsequent calls, make the faulted page always - * locked. + /* For consistency in subsequent calls, make the faulted page + * always locked. */ if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf.page); else VM_BUG_ON(!PageLocked(vmf.page)); + page = vmf.page; + if (pud) { + /* Check consystency of page, if it is applicable for huge + * mapping. + */ + if (__huge_lock_check(mm, vma, address, pud, orig_pmd, pgoff, + flags, vmf.page)) { + unlock_page(page); + goto unwritable_page; + } + } + /* * Should we do an early C-O-W break? */ - page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + if (pud) { + /* Private cowing not supported yet for huge. 
*/ + BUG(); + } + page = cow_page; anon = 1; copy_user_highpage(page, vmf.page, address, vma); @@ -3238,89 +3406,156 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * address space wants to know that the page is about * to become writable */ - if (vma->vm_ops->page_mkwrite) { + if ((!pud && vm_ops->page_mkwrite) || + (pud && vm_ops->page_mkwrite_huge)) { int tmp; - unlock_page(page); vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + tmp = vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; goto unwritable_page; } if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + if (pud) + BUG(); lock_page(page); if (!page->mapping) { ret = 0; /* retry the fault */ - unlock_page(page); goto unwritable_page; } } else VM_BUG_ON(!PageLocked(page)); - page_mkwrite = 1; + page_mkwrite = 1 << (PMD_SHIFT - PAGE_SHIFT); } } } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. + /* Following if is almost same for pud and not pud just, specified + * methods changed. Keep it as far as possi ble synchronized */ - /* Only go through if we didn't race with anybody else... */ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); + if (pud) { + huge_table = pmd_offset(pud, address); + /* During allocation of pte pte_alloc uses, mm's page table lock + * it is not best solution, but we reuse it here. + */ + ptl = &mm->page_table_lock; + spin_lock(ptl); + if (likely(pmd_same(*huge_table, orig_pmd))) { + flush_icache_page(vma, page);/* TODO Arch specific? 
*/ + hentry = mk_pmd(page, vma->vm_page_prot); + hentry = pmd_mkhuge(hentry); + if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); + hentry = pmd_mkdirty(hentry); + /* TODO make it pmd_maybe_mkwrite*/ + if (likely(vma->vm_flags & VM_WRITE)) + hentry = pmd_mkwrite(hentry); } - } - set_pte_at(mm, address, page_table, entry); + if (anon) { + BUG(); + inc_mm_counter_fast(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + /* TODO Inc of huge pages counter...*/ + add_mm_counter_fast(mm, MM_FILEPAGES, + HPAGE_PMD_NR); + page_add_file_rmap_huge(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; + get_page(dirty_page); + get_page_tails_for_fmap(dirty_page); + } + } + set_pmd_at(mm, address, huge_table, hentry); - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); + /* no need to invalidate: a not-present page won't be + * cached */ + update_mmu_cache(vma, address, page_table); + } else { + if (cow_page) + mem_cgroup_uncharge_page(cow_page); + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ + } + spin_unlock(ptl); } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ - } + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if FAULT_FLAG_WRITE is set, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... 
*/ + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (anon) { + inc_mm_counter_fast(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(mm, MM_FILEPAGES); + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; + get_page(dirty_page); + } + } + set_pte_at(mm, address, page_table, entry); - pte_unmap_unlock(page_table, ptl); + /* no need to invalidate: a not-present page won't be + * cached */ + update_mmu_cache(vma, address, page_table); + } else { + if (cow_page) + mem_cgroup_uncharge_page(cow_page); + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ + } + pte_unmap_unlock(page_table, ptl); + } if (dirty_page) { struct address_space *mapping = page->mapping; - if (set_page_dirty(dirty_page)) - page_mkwrite = 1; - unlock_page(dirty_page); + if (pud) { + int dirtied; + dirtied = set_page_dirty_huge(dirty_page); + unlock_page(dirty_page); + if (dirtied) + page_mkwrite = dirtied; + } else { + if (set_page_dirty(dirty_page)) + page_mkwrite = 1; + unlock_page(dirty_page); + } + + if (pud) { + put_page_tails_for_fmap(dirty_page); + compound_put(page); + } + put_page(dirty_page); if (page_mkwrite && mapping) { /* * Some device drivers do not set page.mapping but still * dirty their pages */ - balance_dirty_pages_ratelimited(mapping); + balance_dirty_pages_ratelimited_nr(mapping, + page_mkwrite); } /* file_update_time outside page_lock */ @@ -3328,6 +3563,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, file_update_time(vma->vm_file); } else { unlock_page(vmf.page); + if (pud) + compound_put(page); if (anon) page_cache_release(vmf.page); } @@ -3335,6 +3572,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; unwritable_page: + if (pud) { + compound_put(page); + put_page_tails_for_fmap(page); + } page_cache_release(page); return ret; uncharge_out: @@ -3346,6 +3587,33 @@ uncharge_out: return ret; } +/** Facade for {@link __do_fault} to fault "huge" pages. + * GCC will strip unneeded code basing on parameters passed. + */ +static int __do_fault_huge(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pud_t *pud, + pgoff_t pgoff, unsigned int flags, + pmd_t orig_pmd) +{ + pte_t pte_any; + return __do_fault( + mm, vma, address, pud, NULL, pgoff, flags, orig_pmd, pte_any); +} + +/** Facade for {@link __do_fault} to fault "normal", pte level pages. + * GCC will strip unneeded code basing on parameters passed. 
+ */ +static int __do_fault_normal(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + pmd_t pmd_any; + return __do_fault( + mm, vma, address, NULL, pmd, pgoff, flags, pmd_any, orig_pte); +} + static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3354,7 +3622,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3386,7 +3654,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3455,6 +3723,105 @@ unlock: return 0; } +/** Handles fault on pde level.*/ +int handle_pmd_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pud_t *pud, pmd_t *pmd, unsigned int flags) +{ + pte_t *pte; + pgoff_t pgoff; + pmd_t pmdVal; + int faultResult; + + if (!vma->vm_file) { + /* Anonymous THP handling */ + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) { + return do_huge_pmd_anonymous_page(mm, vma, + address, pmd, flags); + } + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, + address, pmd, orig_pmd); + return 0; + } + goto handle_pte_level; + } + } + /*************************** + * Page cache THP handling * + ***************************/ + pmdVal = *pmd; + if (pmd_present(pmdVal) && !pmd_trans_huge(pmdVal)) + goto handle_pte_level; + + if ((address & HPAGE_MASK) < vma->vm_start) + goto handle_pte_level; + + /* Even if possible we currently support only for SHARED VMA. + * + * We support this only for shmem fs, but everyone is encorege + * to add few simple methods and test it for other file systems. + * Notes, warrnings etc are always welcome. + */ + if (!(vma->vm_flags & VM_SHARED)) + goto handle_pte_level; + + /* Handle fault of possible vma with huge page. */ + pgoff = (((address & HPAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + + if (!pmd_present(pmdVal)) { + /* No page at all. */ + if (!check_if_hugemapping_is_possible0(vma, address, pgoff, + pmdVal)) + goto handle_pte_level; + } else { + /* TODO Jump to make page writable. If not for regular + * filesystems, full fault path will be reused. + */ + } + + faultResult = __do_fault_huge(mm, vma, address, pud, pgoff, flags, + pmdVal); + if (!(faultResult & (VM_FAULT_ERROR | VM_FAULT_NOHUGE))) { + printk(KERN_INFO "Setted huge pmd"); + return faultResult; + } + +handle_pte_level: + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) + return VM_FAULT_OOM; + /* Page cache THP uses mm->page_table_lock to check if pmd is still + * none just before setting ne huge pmd, is __pte_alloc suceeded + * then pmd may be huge or "normal" with ptes page. 
+ * + * if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); + + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + /* * By the time we get here, we already hold the mm semaphore */ @@ -3464,7 +3831,6 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte; __set_current_state(TASK_RUNNING); @@ -3484,42 +3850,8 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - if (!vma->vm_ops) - return do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); - } else { - pmd_t orig_pmd = *pmd; - barrier(); - if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); - return 0; - } - } - /* - * Use __pte_alloc instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) - return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) - return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). - */ - pte = pte_offset_map(pmd, address); - - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + return handle_pmd_fault(mm, vma, address, pud, pmd, flags); } #ifndef __PAGETABLE_PUD_FOLDED diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 363ba70..ff32b5d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2072,6 +2072,37 @@ int set_page_dirty(struct page *page) } EXPORT_SYMBOL(set_page_dirty); +int set_page_dirty_huge(struct page *head) +{ + struct page *work; + int result = 0; + + VM_BUG_ON(!PageHead(head)); + VM_BUG_ON(!PageLocked(head)); + VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1); + + if (head->mapping) + result += set_page_dirty(head); + else + BUG_ON(!PageSplitDeque(head)); + + for (work = head+1; compound_head(work) == head; work++) { + VM_BUG_ON(page_has_private(work)); + VM_BUG_ON(page_has_buffers(work)); + + lock_page(work); + if (work->mapping) { + result += set_page_dirty(work); + } else { + /* Bug if there is no mapping and split is not + * dequeued. + */ + BUG_ON(!PageSplitDeque(head)); + } + unlock_page(work); + } + return result; +} /* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. 
This is because another diff --git a/mm/rmap.c b/mm/rmap.c index c8454e0..11f54e0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1157,6 +1157,21 @@ void page_add_file_rmap(struct page *page) } } +void page_add_file_rmap_huge(struct page *head) +{ + struct page *page; + + VM_BUG_ON(!PageHead(head)); + VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1); + + page_add_file_rmap(head); + page_add_file_rmap(head + 1); + if (likely(compound_order(head) > 1)) { + for (page = head+2; page->__first_page == head; page++) + page_add_file_rmap(page); + } +} + /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -1207,6 +1222,20 @@ void page_remove_rmap(struct page *page) */ } +void page_remove_rmap_huge(struct page *head) +{ + struct page *page; + + VM_BUG_ON(!PageHead(head)); + VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1); + + page_remove_rmap(head); + page_remove_rmap(head + 1); + if (likely(compound_order(head) > 1)) { + for (page = head+2; page->__first_page == head; page++) + page_remove_rmap(page); + } +} /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. -- 1.7.3.4
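P.S. A minimal, untested sketch of how a filesystem might wire up the new ->fault_huge hook follows, for illustration only; it is not part of the patch. All example_* names are made up, and only the fault_huge hook, the VM_FAULT_NOHUGE return code and the "return the head page locked" convention come from the patch above. The exact reference-counting contract for the tail pages (see the XXX about getting the tails in __do_fault) is still open, so this only shows the intended shape: hand back a referenced, locked, up-to-date head page of a cached huge page, or return VM_FAULT_NOHUGE so the caller falls back to the ordinary ->fault path.

/* Hypothetical sketch only -- not part of the patch. */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/huge_mm.h>

static int example_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct page *head;

	/* vmf->pgoff is the PAGE_SIZE index of the head page and is
	 * huge-page aligned, because handle_pmd_fault() only takes this
	 * path for pmd-aligned offsets in linear VM_SHARED mappings. */
	head = find_get_page(mapping, vmf->pgoff);
	if (!head)
		return VM_FAULT_NOHUGE;		/* caller retries with ->fault */

	lock_page(head);
	if (!PageHead(head) || !PageUptodate(head) ||
	    compound_order(head) != HPAGE_PMD_ORDER ||
	    head->mapping != mapping) {
		/* Not (or no longer) a usable huge page: fall back. */
		unlock_page(head);
		page_cache_release(head);
		return VM_FAULT_NOHUGE;
	}

	vmf->page = head;			/* referenced head page... */
	return VM_FAULT_LOCKED;			/* ...returned locked */
}

static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,	/* the normal 4K path stays */
	.fault_huge	= example_fault_huge,	/* new hook from this patch */
};

Because check_if_hugemapping_is_possible0() already rejects unaligned addresses and non-linear or non-shared VMAs before ->fault_huge is called, an implementation can stay this simple and leave every hard case to the existing ->fault path.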