The patch titled

     mm: tracking shared dirty pages

has been added to the -mm tree.  Its filename is

     mm-tracking-shared-dirty-pages.patch

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this

------------------------------------------------------
Subject: mm: tracking shared dirty pages
From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxx>
---

 fs/buffer.c          |    2 -
 include/linux/mm.h   |    6 +++
 include/linux/rmap.h |    8 ++++
 mm/memory.c          |   31 +++++++++++++++---
 mm/mmap.c            |   34 ++++++++++++++++----
 mm/mprotect.c        |    7 +++-
 mm/page-writeback.c  |    2 +
 mm/rmap.c            |   68 +++++++++++++++++++++++++++++++++++++++++
 8 files changed, 144 insertions(+), 14 deletions(-)

diff -puN fs/buffer.c~mm-tracking-shared-dirty-pages fs/buffer.c
--- a/fs/buffer.c~mm-tracking-shared-dirty-pages
+++ a/fs/buffer.c
@@ -2983,6 +2983,7 @@ int try_to_free_buffers(struct page *pag
 
 	spin_lock(&mapping->private_lock);
 	ret = drop_buffers(page, &buffers_to_free);
+	spin_unlock(&mapping->private_lock);
 	if (ret) {
 		/*
 		 * If the filesystem writes its buffers by hand (eg ext3)
@@ -2994,7 +2995,6 @@ int try_to_free_buffers(struct page *pag
 		 */
 		clear_page_dirty(page);
 	}
-	spin_unlock(&mapping->private_lock);
 out:
 	if (buffers_to_free) {
 		struct buffer_head *bh = buffers_to_free;
diff -puN include/linux/mm.h~mm-tracking-shared-dirty-pages include/linux/mm.h
--- a/include/linux/mm.h~mm-tracking-shared-dirty-pages
+++ a/include/linux/mm.h
@@ -181,6 +181,12 @@ extern unsigned int kobjsize(const void
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+static inline int is_shared_writable(unsigned int flags)
+{
+	return (flags & (VM_SHARED|VM_WRITE|VM_PFNMAP)) ==
+		(VM_SHARED|VM_WRITE);
+}
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
diff -puN include/linux/rmap.h~mm-tracking-shared-dirty-pages include/linux/rmap.h
--- a/include/linux/rmap.h~mm-tracking-shared-dirty-pages
+++ a/include/linux/rmap.h
@@ -103,6 +103,14 @@ pte_t *page_check_address(struct page *,
  */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Cleans the PTEs of shared mappings.
+ * (and since clean PTEs should also be readonly, write protects them too)
+ *
+ * returns the number of cleaned PTEs.
+ */
+int page_mkclean(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
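
The new page_mkclean() is driven from writeback; the clear_page_dirty_for_io()
hunk further down shows the real call site.  As a rough sketch of the intended
calling convention (the surrounding function below is made up for illustration
and is not part of the patch):

/*
 * Illustrative only, not in the patch.  page_mkclean() must be called with
 * the page locked; it visits every shared mapping of the page, cleans (and,
 * for dirty-accounted mappings, write protects) the PTEs, and returns the
 * number of PTEs it cleaned.
 */
static int example_prepare_writeout(struct page *page)
{
	BUG_ON(!PageLocked(page));

	if (TestClearPageDirty(page)) {
		/*
		 * Hand the dirty state over to writeback; the next store
		 * through a shared mapping faults and re-dirties the page.
		 */
		page_mkclean(page);
		return 1;	/* page must be written out */
	}
	return 0;		/* page was already clean */
}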
diff -puN mm/memory.c~mm-tracking-shared-dirty-pages mm/memory.c
--- a/mm/memory.c~mm-tracking-shared-dirty-pages
+++ a/mm/memory.c
@@ -937,6 +937,12 @@ struct page *follow_page(struct vm_area_
 	pte = *ptep;
 	if (!pte_present(pte))
 		goto unlock;
+	/*
+	 * This is not fully correct in the light of trapping write faults
+	 * for writable shared mappings. However since we're going to mark
+	 * the page dirty anyway some few lines downward, we might as well
+	 * take the write fault now.
+	 */
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 	page = vm_normal_page(vma, address, pte);
@@ -1457,14 +1463,15 @@ static int do_wp_page(struct mm_struct *
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse, ret = VM_FAULT_MINOR;
+	int reuse = 0;
+	int ret = VM_FAULT_MINOR;
+	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
-				(VM_SHARED|VM_WRITE))) {
+	if (unlikely(is_shared_writable(vma->vm_flags))) {
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to
@@ -1493,13 +1500,12 @@ static int do_wp_page(struct mm_struct *
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
 		}
-
+		dirty_page = old_page;
+		get_page(dirty_page);
 		reuse = 1;
 	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
-	} else {
-		reuse = 0;
 	}
 
 	if (reuse) {
@@ -1565,6 +1571,10 @@ gotten:
 		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	if (old_page)
@@ -2094,6 +2104,7 @@ static int do_no_page(struct mm_struct *
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2188,6 +2199,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2200,6 +2215,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
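
For context, a minimal userspace exercise of the new fault path might look
like the following (hypothetical test program, not part of the patch; the
file name is arbitrary).  With the write protection installed at mmap() time,
the first store into each page of a MAP_SHARED file mapping takes a
write-protect fault, which is where do_wp_page() now marks the page dirty:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	char *map;

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	memset(map, 'x', 4096);		/* first store: wp fault, page marked dirty */
	msync(map, 4096, MS_SYNC);	/* writeback cleans and write protects the PTE */
	map[0] = 'y';			/* faults and dirties the page again */

	munmap(map, 4096);
	close(fd);
	return 0;
}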
diff -puN mm/mmap.c~mm-tracking-shared-dirty-pages mm/mmap.c
--- a/mm/mmap.c~mm-tracking-shared-dirty-pages
+++ a/mm/mmap.c
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/backing-dev.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -888,6 +889,7 @@ unsigned long do_mmap_pgoff(struct file
 	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
 	unsigned long charged = 0, reqprot = prot;
+	struct address_space *mapping = NULL;
 
 	if (file) {
 		if (is_file_hugepages(file))
@@ -1084,18 +1086,13 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
 			goto free_vma;
 	}
 
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		vma->vm_page_prot =
-			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1113,6 +1110,31 @@ munmap_back:
 	pgoff = vma->vm_pgoff;
 	vm_flags = vma->vm_flags;
 
+	/*
+	 * Tracking of dirty pages for shared writable mappings. Do this by
+	 * write protecting writable pages, and mark dirty in the write fault.
+	 *
+	 * Modify vma->vm_page_prot (the default protection for new pages)
+	 * to this effect.
+	 *
+	 * Cannot do before because the condition depends on:
+	 *  - backing_dev_info having the right capabilities
+	 *    (set by f_op->open())
+	 *  - vma->vm_flags being fully set
+	 *    (finished in f_op->mmap(), which could call remap_pfn_range())
+	 *
+	 * Also, cannot reset vma->vm_page_prot from vma->vm_flags because
+	 * f_op->mmap() can modify it.
+	 */
+	if (is_shared_writable(vm_flags) && vma->vm_file)
+		mapping = vma->vm_file->f_mapping;
+	if ((mapping && mapping_cap_account_dirty(mapping)) ||
+			(vma->vm_ops && vma->vm_ops->page_mkwrite))
+		vma->vm_page_prot =
+			__pgprot(pte_val
+				(pte_wrprotect
+				 (__pte(pgprot_val(vma->vm_page_prot)))));
+
 	if (!file || !vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags,
 			NULL, file, pgoff, vma_policy(vma))) {
 		file = vma->vm_file;
diff -puN mm/mprotect.c~mm-tracking-shared-dirty-pages mm/mprotect.c
--- a/mm/mprotect.c~mm-tracking-shared-dirty-pages
+++ a/mm/mprotect.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -124,6 +125,7 @@ mprotect_fixup(struct vm_area_struct *vm
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
 	unsigned int mask;
+	struct address_space *mapping = NULL;
 	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
@@ -179,7 +181,10 @@ success:
 	/* Don't make the VMA automatically writable if it's shared, but the
 	 * backer wishes to know when pages are first written to */
 	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+	if (is_shared_writable(newflags) && vma->vm_file)
+		mapping = vma->vm_file->f_mapping;
+	if ((mapping && mapping_cap_account_dirty(mapping)) ||
+			(vma->vm_ops && vma->vm_ops->page_mkwrite))
 		mask &= ~VM_SHARED;
 
 	newprot = protection_map[newflags & mask];
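
Note that do_mmap_pgoff() and mprotect_fixup() above open-code the same test
for whether first writes to a shared mapping must be trapped.  As a sketch,
the shared condition amounts to the following (hypothetical helper, not part
of the patch):

/*
 * A mapping needs write notification if its backing device accounts dirty
 * pages, or if the backer asked for ->page_mkwrite() callbacks.
 */
static int example_wants_write_notify(struct vm_area_struct *vma,
				      unsigned int flags)
{
	struct address_space *mapping = NULL;

	if (is_shared_writable(flags) && vma->vm_file)
		mapping = vma->vm_file->f_mapping;

	return (mapping && mapping_cap_account_dirty(mapping)) ||
		(vma->vm_ops && vma->vm_ops->page_mkwrite);
}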
diff -puN mm/page-writeback.c~mm-tracking-shared-dirty-pages mm/page-writeback.c
--- a/mm/page-writeback.c~mm-tracking-shared-dirty-pages
+++ a/mm/page-writeback.c
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -759,6 +760,7 @@ int clear_page_dirty_for_io(struct page
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
+			page_mkclean(page);
 			if (mapping_cap_account_dirty(mapping))
 				dec_page_state(nr_dirty);
 			return 1;
diff -puN mm/rmap.c~mm-tracking-shared-dirty-pages mm/rmap.c
--- a/mm/rmap.c~mm-tracking-shared-dirty-pages
+++ a/mm/rmap.c
@@ -53,6 +53,7 @@
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 
 #include <asm/tlbflush.h>
 
@@ -434,6 +435,73 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, int protect)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!(pte_dirty(*pte) || (protect && pte_write(*pte))))
+		goto unlock;
+
+	entry = ptep_get_and_clear(mm, address, pte);
+	entry = pte_mkclean(entry);
+	if (protect)
+		entry = pte_wrprotect(entry);
+	ptep_establish(vma, address, pte, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_mkclean_file(struct address_space *mapping, struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		int protect = mapping_cap_account_dirty(mapping) &&
+			is_shared_writable(vma->vm_flags);
+		ret += page_mkclean_one(page, vma, protect);
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_mkclean(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping)
+			ret = page_mkclean_file(mapping, page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
_

Patches currently in -mm which might be from a.p.zijlstra@xxxxxxxxx are

buglet-in-radix_tree_tag_set.patch
add-page_mkwrite-vm_operations-method-fix.patch
mm-tracking-shared-dirty-pages.patch
mm-balance-dirty-pages.patch
mm-msync-cleanup.patch
mm-optimize-the-new-mprotect-code-a-bit.patch
mm-small-cleanup-of-install_page.patch
mm-remove-some-update_mmu_cache-calls.patch

-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html