Hi Rik, Our linux-tux3 tree currently carries this 652 line diff against core, to make Tux3 work. This is mainly by Hirofumi, except the fs-writeback.c hook, which is by me. The main part you may be interested in is rmap.c, which addresses the issues raised at the 2013 Linux Storage, Filesystem and MM Summit in San Francisco.[1] LSFMM: Page forking http://lwn.net/Articles/548091/ This is just an FYI. An upcoming Tux3 report will be a tour of the page forking design and implementation. For now, this is just to give a general sense of what we have done. We heard there are concerns about how ptrace will work. I really am not familiar with the issue; could you please explain what you were thinking of there? Enjoy, Daniel [1] Which happened to be a 15 minute bus ride away from me at the time. diffstat tux3.core.patch fs/Makefile | 1 fs/fs-writeback.c | 100 +++++++++++++++++++++++++-------- include/linux/fs.h | 6 + include/linux/mm.h | 5 + include/linux/pagemap.h | 2 include/linux/rmap.h | 14 ++++ include/linux/writeback.h | 23 +++++++ mm/filemap.c | 82 +++++++++++++++++++++++++++ mm/rmap.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++ mm/truncate.c | 98 ++++++++++++++++++++------------ 10 files changed, 411 insertions(+), 59 deletions(-) diff --git a/fs/Makefile b/fs/Makefile index 91fcfa3..44d7192 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -70,7 +70,6 @@ obj-$(CONFIG_EXT4_FS) += ext4/ obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_TUX3) += tux3/ -obj-$(CONFIG_TUX3_MMAP) += tux3/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ obj-y += ramfs/ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5..fcd1c61 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -34,25 +34,6 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) -/* - * Passed into wb_writeback(), essentially a subset of writeback_control - */ -struct wb_writeback_work { - long nr_pages; - struct super_block *sb; - 
unsigned long *older_than_this; - enum writeback_sync_modes sync_mode; - unsigned int tagged_writepages:1; - unsigned int for_kupdate:1; - unsigned int range_cyclic:1; - unsigned int for_background:1; - unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ - enum wb_reason reason; /* why was writeback initiated? */ - - struct list_head list; /* pending work list */ - struct completion *done; /* set if the caller waits */ -}; - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -192,6 +173,36 @@ void inode_wb_list_del(struct inode *inode) } /* + * Remove inode from writeback list if clean. + */ +void inode_writeback_done(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_DIRTY)) + list_del_init(&inode->i_wb_list); + spin_unlock(&inode->i_lock); + spin_unlock(&bdi->wb.list_lock); +} +EXPORT_SYMBOL_GPL(inode_writeback_done); + +/* + * Add inode to writeback dirty list with current time. + */ +void inode_writeback_touch(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); + inode->dirtied_when = jiffies; + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&bdi->wb.list_lock); +} +EXPORT_SYMBOL_GPL(inode_writeback_touch); + +/* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * @@ -610,9 +621,9 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, * * Return the number of pages and/or inodes written. 
*/ -static long writeback_sb_inodes(struct super_block *sb, - struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long generic_writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { struct writeback_control wbc = { .sync_mode = work->sync_mode, @@ -727,6 +738,22 @@ static long writeback_sb_inodes(struct super_block *sb, return wrote; } +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + if (sb->s_op->writeback) { + long ret; + + spin_unlock(&wb->list_lock); + ret = sb->s_op->writeback(sb, wb, work); + spin_lock(&wb->list_lock); + return ret; + } + + return generic_writeback_sb_inodes(sb, wb, work); +} + static long __writeback_inodes_wb(struct bdi_writeback *wb, struct wb_writeback_work *work) { @@ -1293,6 +1320,35 @@ static void wait_sb_inodes(struct super_block *sb) } /** + * writeback_queue_work_sb - schedule writeback work from given super_block + * @sb: the superblock + * @work: work item to queue + * + * Schedule writeback work on this super_block. This usually used to + * interact with sb->s_op->writeback callback. The caller must + * guarantee to @work is not freed while bdi flusher is using (for + * example, be safe against umount). + */ +void writeback_queue_work_sb(struct super_block *sb, + struct wb_writeback_work *work) +{ + if (sb->s_bdi == &noop_backing_dev_info) + return; + + /* Allow only following fields to use. 
*/ + *work = (struct wb_writeback_work){ + .sb = sb, + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .done = work->done, + .nr_pages = work->nr_pages, + .reason = work->reason, + }; + bdi_queue_work(sb->s_bdi, work); +} +EXPORT_SYMBOL(writeback_queue_work_sb); + +/** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13..29833d2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,8 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); + void (*truncatepage)(struct address_space *, struct page *, + unsigned int, unsigned int, int); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); @@ -1590,6 +1592,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); +struct bdi_writeback; +struct wb_writeback_work; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); @@ -1599,6 +1603,8 @@ struct super_operations { int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); + long (*writeback)(struct super_block *super, struct bdi_writeback *wb, + struct wb_writeback_work *work); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *); int (*freeze_fs) (struct super_block *); diff --git a/include/linux/mm.h b/include/linux/mm.h index dd5ea30..075f59f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1909,6 +1909,11 @@ vm_unmapped_area(struct vm_unmapped_area_info *info) } /* truncate.c */ +void 
generic_truncate_partial_page(struct address_space *mapping, + struct page *page, unsigned int start, + unsigned int len); +void generic_truncate_full_page(struct address_space *mapping, + struct page *page, int wait); extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f..13b70160 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -653,6 +653,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +int cow_replace_page_cache(struct page *oldpage, struct page *newpage); +void cow_delete_from_page_cache(struct page *page); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e..9b67360 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -228,6 +228,20 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); int page_mkclean(struct page *); /* + * Make clone page for page forking. + * + * Note: only clones page state so other state such as buffer_heads + * must be cloned by caller. + */ +struct page *cow_clone_page(struct page *oldpage); + +/* + * Changes the PTES of shared mappings except the PTE in orig_vma. + */ +int page_cow_file(struct vm_area_struct *orig_vma, struct page *oldpage, + struct page *newpage); + +/* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. 
*/ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0004833..0784b9d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -59,6 +59,25 @@ enum wb_reason { }; /* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_work { + long nr_pages; + struct super_block *sb; + unsigned long *older_than_this; + enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + enum wb_reason reason; /* why was writeback initiated? */ + + struct list_head list; /* pending work list */ + struct completion *done; /* set if the caller waits */ +}; + +/* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised * in a manner such that unspecified fields are set to zero. @@ -90,6 +109,10 @@ struct writeback_control { * fs/fs-writeback.c */ struct bdi_writeback; +void inode_writeback_done(struct inode *inode); +void inode_writeback_touch(struct inode *inode); +void writeback_queue_work_sb(struct super_block *sb, + struct wb_writeback_work *work); void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); diff --git a/mm/filemap.c b/mm/filemap.c index 673e458..8c641d0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -639,6 +639,88 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); +/* + * Atomically replace oldpage with newpage. + * + * Similar to migrate_pages(), but the oldpage is for writeout. 
+ */ +int cow_replace_page_cache(struct page *oldpage, struct page *newpage) +{ + struct address_space *mapping = oldpage->mapping; + void **pslot; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + + /* Get refcount for radix-tree */ + page_cache_get(newpage); + + /* Replace page in radix tree. */ + spin_lock_irq(&mapping->tree_lock); + /* PAGECACHE_TAG_DIRTY represents the view of frontend. Clear it. */ + if (PageDirty(oldpage)) + radix_tree_tag_clear(&mapping->page_tree, page_index(oldpage), + PAGECACHE_TAG_DIRTY); + /* The refcount to newpage is used for radix tree. */ + pslot = radix_tree_lookup_slot(&mapping->page_tree, oldpage->index); + radix_tree_replace_slot(pslot, newpage); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_migrate(oldpage, newpage, true); + + /* Release refcount for radix-tree */ + page_cache_release(oldpage); + + return 0; +} +EXPORT_SYMBOL_GPL(cow_replace_page_cache); + +/* + * Delete page from radix-tree, leaving page->mapping unchanged. + * + * Similar to delete_from_page_cache(), but the deleted page is for writeout. + */ +void cow_delete_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + /* Delete page from radix tree. */ + spin_lock_irq(&mapping->tree_lock); + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. 
We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_invalidate_page(mapping, page); + + page_cache_tree_delete(mapping, page, NULL); +#if 0 /* FIXME: backend is assuming page->mapping is available */ + page->mapping = NULL; +#endif + /* Leave page->index set: truncation lookup relies upon it */ + + __dec_zone_page_state(page, NR_FILE_PAGES); + BUG_ON(page_mapped(page)); + + /* + * The following dirty accounting is done by writeback + * path. So, we don't need to do here. + * + * dec_zone_page_state(page, NR_FILE_DIRTY); + * dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + */ + spin_unlock_irq(&mapping->tree_lock); + + page_cache_release(page); +} +EXPORT_SYMBOL_GPL(cow_delete_from_page_cache); + #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd..9125246 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -923,6 +923,145 @@ int page_mkclean(struct page *page) } EXPORT_SYMBOL_GPL(page_mkclean); +/* + * Make clone page for page forking. (Based on migrate_page_copy()) + * + * Note: only clones page state so other state such as buffer_heads + * must be cloned by caller. + */ +struct page *cow_clone_page(struct page *oldpage) +{ + struct address_space *mapping = oldpage->mapping; + gfp_t gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; + struct page *newpage = __page_cache_alloc(gfp_mask); + int cpupid; + + newpage->mapping = oldpage->mapping; + newpage->index = oldpage->index; + copy_highpage(newpage, oldpage); + + /* FIXME: right? 
*/ + BUG_ON(PageSwapCache(oldpage)); + BUG_ON(PageSwapBacked(oldpage)); + BUG_ON(PageHuge(oldpage)); + if (PageError(oldpage)) + SetPageError(newpage); + if (PageReferenced(oldpage)) + SetPageReferenced(newpage); + if (PageUptodate(oldpage)) + SetPageUptodate(newpage); + if (PageActive(oldpage)) + SetPageActive(newpage); + if (PageMappedToDisk(oldpage)) + SetPageMappedToDisk(newpage); + + /* + * Copy NUMA information to the new page, to prevent over-eager + * future migrations of this same page. + */ + cpupid = page_cpupid_xchg_last(oldpage, -1); + page_cpupid_xchg_last(newpage, cpupid); + + mlock_migrate_page(newpage, oldpage); + ksm_migrate_page(newpage, oldpage); + + /* Lock newpage before visible via radix tree */ + BUG_ON(PageLocked(newpage)); + __set_page_locked(newpage); + + return newpage; +} +EXPORT_SYMBOL_GPL(cow_clone_page); + +static int page_cow_one(struct page *oldpage, struct page *newpage, + struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t oldptval, ptval, *pte; + spinlock_t *ptl; + int ret = 0; + + pte = page_check_address(oldpage, mm, address, &ptl, 1); + if (!pte) + goto out; + + flush_cache_page(vma, address, pte_pfn(*pte)); + oldptval = ptep_clear_flush(vma, address, pte); + + /* Take refcount for PTE */ + page_cache_get(newpage); + + /* + * vm_page_prot doesn't have writable bit, so page fault will + * be occurred immediately after returned from this page fault + * again. And second time of page fault will be resolved with + * forked page was set here. + */ + ptval = mk_pte(newpage, vma->vm_page_prot); +#if 0 + /* FIXME: we should check following too? 
Otherwise, we would + * get additional read-only => write fault at least */ + if (pte_write) + ptval = pte_mkwrite(ptval); + if (pte_dirty(oldptval)) + ptval = pte_mkdirty(ptval); + if (pte_young(oldptval)) + ptval = pte_mkyoung(ptval); +#endif + set_pte_at(mm, address, pte, ptval); + + /* Update rmap accounting */ + BUG_ON(!PageMlocked(oldpage)); /* Caller should migrate mlock flag */ + page_remove_rmap(oldpage); + page_add_file_rmap(newpage); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); + + pte_unmap_unlock(pte, ptl); + + mmu_notifier_invalidate_page(mm, address); + + /* Release refcount for PTE */ + page_cache_release(oldpage); +out: + return ret; +} + +/* Change old page in PTEs to new page exclude orig_vma */ +int page_cow_file(struct vm_area_struct *orig_vma, struct page *oldpage, + struct page *newpage) +{ + struct address_space *mapping = page_mapping(oldpage); + pgoff_t pgoff = oldpage->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + int ret = 0; + + BUG_ON(!PageLocked(oldpage)); + BUG_ON(!PageLocked(newpage)); + BUG_ON(PageAnon(oldpage)); + BUG_ON(mapping == NULL); + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + /* + * The orig_vma's PTE is handled by caller. + * (e.g. 
->page_mkwrite) + */ + if (vma == orig_vma) + continue; + + if (vma->vm_flags & VM_SHARED) { + unsigned long address = vma_address(oldpage, vma); + ret += page_cow_one(oldpage, newpage, vma, address); + } + } + i_mmap_unlock_read(mapping); + + return ret; +} +EXPORT_SYMBOL_GPL(page_cow_file); + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d60..e5b4673 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -216,6 +216,56 @@ int invalidate_inode_page(struct page *page) return invalidate_complete_page(mapping, page); } +void generic_truncate_partial_page(struct address_space *mapping, + struct page *page, unsigned int start, + unsigned int len) +{ + wait_on_page_writeback(page); + zero_user_segment(page, start, start + len); + if (page_has_private(page)) + do_invalidatepage(page, start, len); +} +EXPORT_SYMBOL(generic_truncate_partial_page); + +static void truncate_partial_page(struct address_space *mapping, pgoff_t index, + unsigned int start, unsigned int len) +{ + struct page *page = find_lock_page(mapping, index); + if (!page) + return; + + if (!mapping->a_ops->truncatepage) + generic_truncate_partial_page(mapping, page, start, len); + else + mapping->a_ops->truncatepage(mapping, page, start, len, 1); + + cleancache_invalidate_page(mapping, page); + unlock_page(page); + page_cache_release(page); +} + +void generic_truncate_full_page(struct address_space *mapping, + struct page *page, int wait) +{ + if (wait) + wait_on_page_writeback(page); + else if (PageWriteback(page)) + return; + + truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_truncate_full_page); + +static void truncate_full_page(struct address_space *mapping, struct page *page, + int wait) +{ + if (!mapping->a_ops->truncatepage) + generic_truncate_full_page(mapping, page, wait); + else + mapping->a_ops->truncatepage(mapping, page, 0, PAGE_CACHE_SIZE, + wait); +} + /** * 
truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -298,11 +348,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; WARN_ON(page->index != index); - if (PageWriteback(page)) { - unlock_page(page); - continue; - } - truncate_inode_page(mapping, page); + truncate_full_page(mapping, page, 0); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -312,37 +358,18 @@ void truncate_inode_pages_range(struct address_space *mapping, } if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_CACHE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - page_cache_release(page); - } - } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - page_cache_release(page); + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + /* Truncation within a single page */ + top = partial_end; + partial_end = 0; } + truncate_partial_page(mapping, start - 1, partial_start, + top - partial_start); } + if (partial_end) + truncate_partial_page(mapping, end, 0, partial_end); + /* * If the truncation happened within a single page no pages * will be released, just zeroed, so we can bail out now. 
@@ -386,8 +413,7 @@ void truncate_inode_pages_range(struct address_space *mapping, lock_page(page); WARN_ON(page->index != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); + truncate_full_page(mapping, page, 1); unlock_page(page); } pagevec_remove_exceptionals(&pvec); -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html