Hi, I've always liked the idea of being able to do writeout directly based on block number, rather than the valiant but doomed-to-be-suboptimal heuristics that our current dirty writeout system does, as it is running above the pagecache and doesn't know about poor file layouts, or interaction between multiple files or file and metadata. My idea for block based writeout is to have dirty pagecache get inserted into a data structure based on its (block device, block number), and sorted in order of block number for a given block device. You then have a kernel thread per block device, bdflush, which does the background writeout in order. There are minimal heuristics involved here, best is probably just submit blocks before a particular age in a one-way elevator order (which my patch doesn't quite get right). But it would be easy to do things like detect idle block devices to kick off writeout sooner, or write out _all_ dirty blocks in ascending order (which my patch does -- this trades off timeliness for throughput). I haven't worked out what to do about delayed allocation filesystems, exactly. Perhaps blocks without mappings yet could be given some default heuristic by the filesystem for placement in the dirty data structure, or else some kind of scheme to drop into a scan-by-logical-offset mode to do the allocations contiguously before resuming block based writeout. I haven't thought that far ahead ;) Simple patch here to implement this against fsblock. Sorry for an experimental patch on top of an experimental patch... I seem to need some features in fsblock but not buffer.c to make this work nicely (eg. tighter coupling of dirty state between block and page). Patch is pretty experimental, and has a known use-after-free bug on unmount (I'm not going to get much more time for hacking between now and vm/fs meetup, and I'd like to be able to discuss this if people think it is interesting, so I had to rush). 
Mainly attached as proof of concept, which I don't expect anybody is actually going to use. But comments on the concept would be welcome. I wonder why there hasn't been much noise about doing something like this -- are there some known/obvious problems? The data structure I'm using is a per-block_device rbtree for simplicity now. I perhaps foresee scalability issues with locking, so it may be more appropriate to go with a radix-tree or b-tree or something with a more scalable write-side. I don't think this should be a showstopper, though. Thanks, Nick --- Index: linux-2.6/fs/fsblock.c =================================================================== --- linux-2.6.orig/fs/fsblock.c +++ linux-2.6/fs/fsblock.c @@ -21,6 +21,9 @@ #include <linux/gfp.h> #include <linux/cache.h> #include <linux/profile.h> +#include <linux/rbtree.h> +#include <linux/kthread.h> +#include <linux/delay.h> //#include <linux/buffer_head.h> /* too much crap in me */ extern int try_to_free_buffers(struct page *); @@ -45,6 +48,15 @@ void __init fsblock_init(void) SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD, NULL); } +static void clear_block_dirty(struct fsblock *block); + +static int test_and_set_block_dirty(struct fsblock *block); + +static void set_block_dirty(struct fsblock *block) +{ + test_and_set_block_dirty(block); +} + static void init_block(struct page *page, struct fsblock *block, unsigned int bits) { block->flags = 0; @@ -472,14 +484,14 @@ int fsblock_set_page_dirty(struct page * for_each_block(block, b) { FSB_BUG_ON(!test_bit(BL_uptodate, &b->flags)); if (!test_bit(BL_dirty, &b->flags)) { - set_bit(BL_dirty, &b->flags); + set_block_dirty(b); ret = 1; } } } else { FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags)); if (!test_bit(BL_dirty, &block->flags)) { - set_bit(BL_dirty, &block->flags); + set_block_dirty(block); ret = 1; } } @@ -1329,6 +1341,37 @@ static int write_block(struct fsblock *b return submit_block(block, WRITE); } +static int fsblock_test_clear_dirty(struct fsblock 
*block) +{ + int ret = 0; + struct page *page = block->page; + + if (test_bit(BL_dirty, &block->flags)) { + FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags)); + clear_block_dirty(block); + ret = 1; + + if (fsblock_subpage(block)) { + struct fsblock *b; + for_each_block(page_blocks(page), b) { + if (test_bit(BL_dirty, &b->flags)) + goto page_dirty; + } + } + if (!fsblock_superpage(block)) { + ret = clear_page_dirty_for_io(page); + FSB_BUG_ON(!ret); + } else { + struct page *p; + for_each_page(page, fsblock_size(block), p) { + clear_page_dirty_for_io(p); + } end_for_each_page; + } + } +page_dirty: + return ret; +} + int sync_block(struct fsblock *block) { int ret = 0; @@ -1339,28 +1382,8 @@ int sync_block(struct fsblock *block) iolock_block(block); wait_on_block_writeback(block); - FSB_BUG_ON(PageWriteback(page)); /* because block is locked */ - if (test_bit(BL_dirty, &block->flags)) { - FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags)); - clear_bit(BL_dirty, &block->flags); - - if (fsblock_subpage(block)) { - struct fsblock *b; - for_each_block(page_blocks(page), b) { - if (test_bit(BL_dirty, &b->flags)) - goto page_dirty; - } - } - if (!fsblock_superpage(block)) { - ret = clear_page_dirty_for_io(page); - FSB_BUG_ON(!ret); - } else { - struct page *p; - for_each_page(page, fsblock_size(block), p) { - clear_page_dirty_for_io(p); - } end_for_each_page; - } -page_dirty: + FSB_BUG_ON(PageWriteback(page)); + if (fsblock_test_clear_dirty(block)) { set_block_writeback(block); ret = write_block(block); @@ -1398,7 +1421,7 @@ int mark_mblock_dirty(struct fsblock_met FSB_BUG_ON(!fsblock_superpage(&mblock->block) && !test_bit(BL_uptodate, &mblock->block.flags)); - if (test_and_set_bit(BL_dirty, &mblock->block.flags)) + if (test_and_set_block_dirty(&mblock->block)) return 0; page = mblock_block(mblock)->page; @@ -1499,7 +1522,7 @@ int fsblock_sync(struct address_space *m wait_on_block_writeback(block); if (test_bit(BL_dirty, &block->flags)) { FSB_BUG_ON(!test_bit(BL_uptodate, 
&block->flags)); - clear_bit(BL_dirty, &block->flags); + clear_block_dirty(block); ret = write_block(block); if (ret && !err) err = ret; @@ -1600,7 +1623,7 @@ static void sync_underlying_metadata(str if (!err) FSB_BUG_ON(test_bit(BL_dirty, &meta_block->flags)); else { - clear_bit(BL_dirty, &meta_block->flags); + clear_block_dirty(meta_block); wait_on_block_iolock(meta_block); } } @@ -1611,12 +1634,29 @@ void mbforget(struct fsblock_meta *mbloc { struct fsblock *block = mblock_block(mblock); - /* - * XXX: could clear_bit(BL_dirty, &block->flags) here, but we'd also - * need to clear page dirty to remain in synch... and that's a bit - * hard to do without races. - */ - sync_block(block); + if (test_bit(BL_dirty, &block->flags)) { + struct page *page = block->page; + + clear_block_dirty(block); + + if (fsblock_subpage(block)) { + struct fsblock *b; + for_each_block(page_blocks(page), b) { + if (test_bit(BL_dirty, &b->flags)) + goto page_dirty; + } + } + if (!fsblock_superpage(block)) { + cancel_dirty_page(page, PAGE_CACHE_SIZE); + } else { + struct page *p; + for_each_page(page, fsblock_size(block), p) { + cancel_dirty_page(p, PAGE_CACHE_SIZE); + } end_for_each_page; + } + } +page_dirty: + wait_on_block_writeback(block); FSB_BUG_ON(test_bit(BL_dirty, &block->flags)); FSB_BUG_ON(test_bit(BL_writeback, &block->flags)); block_put(block); @@ -1634,7 +1674,8 @@ struct fsblock_meta *mbread(struct block iolock_block(block); if (!test_bit(BL_uptodate, &block->flags)) { int ret; - FSB_BUG_ON(PageWriteback(block->page)); + /* FSB_BUG_ON(PageWriteback(block->page)); */ + wait_on_block_writeback(block); FSB_BUG_ON(test_bit(BL_dirty, &block->flags)); set_block_sync_io(block); ret = read_block(block); @@ -1865,13 +1906,13 @@ static int block_write_helper(struct pag if (test_bit(BL_new, &block->flags)) { sync_underlying_metadata(block); clear_bit(BL_new, &block->flags); - set_bit(BL_dirty, &block->flags); + set_block_dirty(block); } #endif if (test_bit(BL_dirty, &block->flags)) { 
FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags)); - clear_bit(BL_dirty, &block->flags); + clear_block_dirty(block); FSB_BUG_ON(test_bit(BL_readin, &block->flags)); FSB_BUG_ON(test_bit(BL_writeback, &block->flags)); set_bit(BL_writeback, &block->flags); @@ -2003,7 +2044,7 @@ int fsblock_write_page(struct page *page /* XXX: recheck ordering here! don't want to lose dirty bits */ - clear_bit(BL_dirty, &block->flags); + clear_block_dirty(block); ret = write_block(block); if (ret) goto out_unlock; @@ -2033,7 +2074,7 @@ static int block_dirty_helper(struct pag // XXX sync_underlying_metadata(block); FSB_BUG_ON(test_bit(BL_writeback, &block->flags)); clear_bit(BL_new, &block->flags); - set_bit(BL_dirty, &block->flags); + set_block_dirty(block); __set_page_dirty_noblocks(page); return 0; } else if (test_bit(BL_uptodate, &block->flags)) @@ -2085,7 +2126,7 @@ static int fsblock_prepare_write_super(s // XXX sync_underlying_metadata(block); FSB_BUG_ON(test_bit(BL_writeback, &block->flags)); clear_bit(BL_new, &block->flags); - set_bit(BL_dirty, &block->flags); + set_block_dirty(block); } else if (!test_bit(BL_uptodate, &block->flags)) { FSB_BUG_ON(test_bit(BL_dirty, &block->flags)); @@ -2201,7 +2242,7 @@ static int __fsblock_commit_write_super( struct page *page, *p; FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags)); - set_bit(BL_dirty, &block->flags); + set_block_dirty(block); page = block->page; for_each_page(page, size, p) { FSB_BUG_ON(!PageUptodate(p)); @@ -2224,7 +2265,7 @@ static int __fsblock_commit_write_sub(st else set_bit(BL_uptodate, &block->flags); if (!test_bit(BL_dirty, &b->flags)) - set_bit(BL_dirty, &b->flags); + set_block_dirty(b); } if (to - from < PAGE_CACHE_SIZE) FSB_BUG_ON(!PageUptodate(page)); @@ -2262,7 +2303,7 @@ int __fsblock_commit_write(struct page * } if (!test_bit(BL_dirty, &block->flags)) - set_bit(BL_dirty, &block->flags); + set_block_dirty(block); __set_page_dirty_noblocks(page); return 0; @@ -2578,7 +2619,7 @@ static void 
invalidate_block(struct fsbl * via prepare_write failure */ clear_bit(BL_new, &block->flags); - clear_bit(BL_dirty, &block->flags); + clear_block_dirty(block); clear_bit(BL_uptodate, &block->flags); clear_bit(BL_mapped, &block->flags); block->block_nr = -1; @@ -2661,3 +2702,238 @@ int fsblock_file_mmap(struct file *file, return 0; } EXPORT_SYMBOL(fsblock_file_mmap); + +/*** block based writeout ***/ +struct fsblock_bd { + spinlock_t lock; + struct rb_root dirty_root; + unsigned long nr_dirty; + struct task_struct *bdflush; +}; + +static void fsblock_writeout_data(struct block_device *bdev) +{ + sector_t last_block_nr = (sector_t)-1ULL; + struct fsblock_bd *fbd; + unsigned long nr = 0; + + fbd = (struct fsblock_bd *)bdev->bd_private; + spin_lock(&fbd->lock); + while (!RB_EMPTY_ROOT(&fbd->dirty_root)) { + struct page *page; + struct rb_node *node; + struct fsblock *block; + unsigned long ptrdiff; + + FSB_BUG_ON(fbd->nr_dirty == 0); + + if (last_block_nr == (sector_t)-1ULL) { + node = rb_first(&fbd->dirty_root); + block = rb_entry(node, struct fsblock, block_node); + } else { + struct fsblock *tmp = NULL; + + node = fbd->dirty_root.rb_node; + do { + block = rb_entry(node, struct fsblock, block_node); + if (block->block_nr <= last_block_nr) + node = node->rb_right; + else { + tmp = block; + node = node->rb_left; + } + } while (node); + if (!tmp) { + if (nr) + printk("bdflush wrote %lu\n", nr); + nr = 0; + last_block_nr = (sector_t)-1ULL; + continue; + } + block = tmp; + } + last_block_nr = block->block_nr; + + page = block->page; + ptrdiff = block - page_blocks(page); + page_cache_get(page); + spin_unlock(&fbd->lock); + lock_page(page); + if (!PageBlocks(page) || block - page_blocks(page) != ptrdiff) { + unlock_page(page); + page_cache_release(page); + continue; + } + if (fsblock_superpage(block)) { + struct page *p; + for_each_page(page, fsblock_size(block), p) { + if (p == block->page) + continue; + lock_page(p); + } end_for_each_page; + } + 
wait_on_block_writeback(block); + FSB_BUG_ON(PageWriteback(page)); + if (fsblock_test_clear_dirty(block)) { + set_block_writeback(block); + write_block(block); + } else + iounlock_block(block); + nr++; + cond_resched(); + spin_lock(&fbd->lock); + } + if (nr) + printk("bdflush wrote %lu\n", nr); + if (fbd->nr_dirty != 0) { + printk("fbd->nr_dirty = %lu, RB_EMPTY_ROOT = %d\n", fbd->nr_dirty, RB_EMPTY_ROOT(&fbd->dirty_root)); + } + FSB_BUG_ON(fbd->nr_dirty != 0); + spin_unlock(&fbd->lock); + + if (!fbd->nr_dirty) { + msleep(1000); + } +} + +static int bdflush(void *arg) +{ + struct block_device *bdev = arg; + struct backing_dev_info *bdi; + + bdi = bdev->bd_inode_backing_dev_info; + if (!bdi) + bdi = bdev->bd_inode->i_mapping->backing_dev_info; + + printk("bdflush\n"); + while (!writeback_acquire(bdi)) { + printk("bdflush could not acquire bdi\n"); + cpu_relax(); + msleep(1000); + } + printk("bdflush starting\n"); + while (!kthread_should_stop()) { + fsblock_writeout_data(bdev); + } + printk("bdflush finished\n"); + + writeback_release(bdi); + + return 0; +} + +int fsblock_register_super(struct super_block *sb) +{ + struct fsblock_bd *fbd; + + printk("fsblock_register_super\n"); + + fbd = kmalloc(sizeof(struct fsblock_bd), GFP_KERNEL); + if (!fbd) + return -ENOMEM; + spin_lock_init(&fbd->lock); + fbd->dirty_root = RB_ROOT; + fbd->nr_dirty = 0; + fbd->bdflush = kthread_create(bdflush, sb->s_bdev, "bdflush"); + if (IS_ERR(fbd->bdflush)) { + int err = PTR_ERR(fbd->bdflush); + kfree(fbd); + return err; + } + sb->s_bdev->bd_private = (unsigned long)fbd; + wake_up_process(fbd->bdflush); + + return 0; +} + +void fsblock_unregister_super(struct super_block *sb) +{ + struct fsblock_bd *fbd; + + printk("fsblock_unregister_super\n"); + fbd = (struct fsblock_bd *)sb->s_bdev->bd_private; + kthread_stop(fbd->bdflush); + kfree(fbd); +} + +static void fbd_add_dirty_block(struct fsblock_bd *fbd, struct fsblock *block) +{ + struct rb_node **p = &fbd->dirty_root.rb_node; + struct 
rb_node *parent = NULL; + + spin_lock(&fbd->lock); + FSB_BUG_ON(!fbd->nr_dirty && !RB_EMPTY_ROOT(&fbd->dirty_root)); + FSB_BUG_ON(fbd->nr_dirty && RB_EMPTY_ROOT(&fbd->dirty_root)); + if (test_bit(BL_dirty_acct, &block->flags)) { + spin_unlock(&fbd->lock); + return; + } + while (*p != NULL) { + struct fsblock *tmp; + + parent = *p; + tmp = rb_entry(parent, struct fsblock, block_node); + + if (block->block_nr < tmp->block_nr) + p = &parent->rb_left; + else if (block->block_nr > tmp->block_nr) + p = &parent->rb_right; + else + FSB_BUG(); /* XXX: no alias avoidance so this may trigger */ + } + + rb_link_node(&block->block_node, parent, p); + rb_insert_color(&block->block_node, &fbd->dirty_root); + + fbd->nr_dirty++; + set_bit(BL_dirty_acct, &block->flags); + FSB_BUG_ON(RB_EMPTY_ROOT(&fbd->dirty_root)); + spin_unlock(&fbd->lock); +} + +static void fbd_del_dirty_block(struct fsblock_bd *fbd, struct fsblock *block) +{ + spin_lock(&fbd->lock); + if (!test_bit(BL_dirty, &block->flags)) { + FSB_BUG_ON(RB_EMPTY_NODE(&block->block_node)); + rb_erase(&block->block_node, &fbd->dirty_root); + RB_CLEAR_NODE(&block->block_node); + + FSB_BUG_ON(!test_bit(BL_dirty_acct, &block->flags)); + clear_bit(BL_dirty_acct, &block->flags); + FSB_BUG_ON(fbd->nr_dirty == 0); + fbd->nr_dirty--; + FSB_BUG_ON(!fbd->nr_dirty && !RB_EMPTY_ROOT(&fbd->dirty_root)); + FSB_BUG_ON(fbd->nr_dirty && RB_EMPTY_ROOT(&fbd->dirty_root)); + } + spin_unlock(&fbd->lock); +} + +static void clear_block_dirty(struct fsblock *block) +{ + int ret; + + ret = test_and_clear_bit(BL_dirty, &block->flags); + if (ret) { + struct fsblock_bd *fbd; + + fbd = (struct fsblock_bd *)mapping_data_bdev(block->page->mapping)->bd_private; + fbd_del_dirty_block(fbd, block); + } +} + +static int test_and_set_block_dirty(struct fsblock *block) +{ + int ret; + + ret = test_and_set_bit(BL_dirty, &block->flags); + if (!ret) { + struct fsblock_bd *fbd; + + fbd = (struct fsblock_bd *)mapping_data_bdev(block->page->mapping)->bd_private; + 
fbd_add_dirty_block(fbd, block); + } + + return ret; +} + Index: linux-2.6/fs/minix/inode.c =================================================================== --- linux-2.6.orig/fs/minix/inode.c +++ linux-2.6/fs/minix/inode.c @@ -55,6 +55,7 @@ static void minix_put_super(struct super mblock_put(sbi->s_smblock); kfree(sbi->s_imap); sb->s_fs_info = NULL; + fsblock_unregister_super(sb); kfree(sbi); return; @@ -160,15 +161,21 @@ static int minix_fill_super(struct super unsigned int size = BLOCK_SIZE; sector_t blocknr = BLOCK_SIZE / size; unsigned int offset = BLOCK_SIZE - blocknr * size; + int ret = -ENOMEM; sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL); if (!sbi) - return -ENOMEM; + return ret; s->s_fs_info = sbi; BUILD_BUG_ON(32 != sizeof (struct minix_inode)); BUILD_BUG_ON(64 != sizeof(struct minix2_inode)); + ret = fsblock_register_super(s); + if (ret) + goto out_bad_fsblock; + + ret = -EINVAL; if (!sb_set_blocksize(s, size)) goto out_bad_hblock; @@ -347,6 +354,10 @@ out_release: mblock_put(mblock); goto out; +out_bad_fsblock: + /* XXX: leaky */ + goto out; + out_bad_hblock: printk("MINIX-fs: blocksize too small for device\n"); goto out; Index: linux-2.6/include/linux/fsblock.h =================================================================== --- linux-2.6.orig/include/linux/fsblock.h +++ linux-2.6/include/linux/fsblock.h @@ -38,6 +38,8 @@ #define BL_vmapped 15 #endif +#define BL_dirty_acct 16 + /* * XXX: eventually want to replace BL_pagecache_io with synchronised block * and page flags manipulations (set_page_dirty of get_user_pages could be @@ -90,6 +92,15 @@ static inline int size_is_superpage(unsi #endif } +static inline int size_is_subpage(unsigned int size) +{ +#ifdef BLOCK_SUPERPAGE_SUPPORT + return size < PAGE_CACHE_SIZE; +#else + return 0; +#endif +} + static inline int fsblock_subpage(struct fsblock *block) { return fsblock_bits(block) < PAGE_CACHE_SHIFT; @@ -276,6 +287,9 @@ struct fsblock_meta *find_or_create_mblo struct fsblock_meta 
*mbread(struct block_device *bdev, sector_t blocknr, unsigned int size); +int fsblock_register_super(struct super_block *sb); +void fsblock_unregister_super(struct super_block *sb); + static inline struct fsblock_meta *sb_find_get_mblock(struct super_block *sb, sector_t blocknr) { return find_get_mblock(sb->s_bdev, blocknr, sb->s_blocksize); Index: linux-2.6/include/linux/fsblock_types.h =================================================================== --- linux-2.6.orig/include/linux/fsblock_types.h +++ linux-2.6/include/linux/fsblock_types.h @@ -5,6 +5,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> +#include <linux/rbtree.h> #include <asm/atomic.h> #define FSB_DEBUG 1 @@ -38,6 +39,7 @@ struct fsblock { atomic_t count; unsigned long flags; /* XXX: flags could be int for better packing */ + struct rb_node block_node; sector_t block_nr; void *private; struct page *page; /* Superpage block pages found via ->mapping */ - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html