> This is a first step at introducing per-bdi flusher threads. We should > have no change in behaviour, although sb_has_dirty_inodes() is now > ridiculously expensive, as there's no easy way to answer that question. > Not a huge problem, since it'll be deleted in subsequent patches. Could you maybe expand the changelog a bit? If I read the patch right the only thing it does is that it moves from per-sb inode lists to per-bdi inode lists, right? Also sync_sb_inodes() now writes all the inodes in the system, not just the ones for that superblock, doesn't it? Honza > > Signed-off-by: Jens Axboe <jens.axboe@xxxxxxxxxx> > --- > fs/fs-writeback.c | 186 +++++++++++++++++++++++++++---------------- > fs/super.c | 3 - > include/linux/backing-dev.h | 9 ++ > include/linux/fs.h | 5 +- > mm/backing-dev.c | 31 +++++++- > mm/page-writeback.c | 1 - > 6 files changed, 156 insertions(+), 79 deletions(-) > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index e5eaa62..c107cff 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -25,6 +25,7 @@ > #include <linux/buffer_head.h> > #include "internal.h" > > +#define inode_to_bdi(inode) (inode)->i_mapping->backing_dev_info > > /** > * writeback_acquire - attempt to get exclusive writeback access to a device > @@ -158,12 +159,13 @@ void __mark_inode_dirty(struct inode *inode, int flags) > goto out; > > /* > - * If the inode was already on s_dirty/s_io/s_more_io, don't > - * reposition it (that would break s_dirty time-ordering). > + * If the inode was already on b_dirty/b_io/b_more_io, don't > + * reposition it (that would break b_dirty time-ordering). > */ > if (!was_dirty) { > inode->dirtied_when = jiffies; > - list_move(&inode->i_list, &sb->s_dirty); > + list_move(&inode->i_list, > + &inode_to_bdi(inode)->b_dirty); > } > } > out: > @@ -184,31 +186,30 @@ static int write_inode(struct inode *inode, int sync) > * furthest end of its superblock's dirty-inode list. > * > * Before stamping the inode's ->dirtied_when, we check to see whether it is > - * already the most-recently-dirtied inode on the s_dirty list. If that is > + * already the most-recently-dirtied inode on the b_dirty list. If that is > * the case then the inode must have been redirtied while it was being written > * out and we don't reset its dirtied_when. > */ > static void redirty_tail(struct inode *inode) > { > - struct super_block *sb = inode->i_sb; > + struct backing_dev_info *bdi = inode_to_bdi(inode); > > - if (!list_empty(&sb->s_dirty)) { > - struct inode *tail_inode; > + if (!list_empty(&bdi->b_dirty)) { > + struct inode *tail; > > - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); > - if (!time_after_eq(inode->dirtied_when, > - tail_inode->dirtied_when)) > + tail = list_entry(bdi->b_dirty.next, struct inode, i_list); > + if (!time_after_eq(inode->dirtied_when, tail->dirtied_when)) > inode->dirtied_when = jiffies; > } > - list_move(&inode->i_list, &sb->s_dirty); > + list_move(&inode->i_list, &bdi->b_dirty); > } > > /* > - * requeue inode for re-scanning after sb->s_io list is exhausted. > + * requeue inode for re-scanning after bdi->b_io list is exhausted. > */ > static void requeue_io(struct inode *inode) > { > - list_move(&inode->i_list, &inode->i_sb->s_more_io); > + list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io); > } > > static void inode_sync_complete(struct inode *inode) > @@ -240,18 +241,50 @@ static void move_expired_inodes(struct list_head *delaying_queue, > /* > * Queue all expired dirty inodes for io, eldest first. > */ > -static void queue_io(struct super_block *sb, > - unsigned long *older_than_this) > +static void queue_io(struct backing_dev_info *bdi, > + unsigned long *older_than_this) > { > - list_splice_init(&sb->s_more_io, sb->s_io.prev); > - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); > + list_splice_init(&bdi->b_more_io, bdi->b_io.prev); > + move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this); > +} > + > +static int sb_on_inode_list(struct super_block *sb, struct list_head *list) > +{ > + struct inode *inode; > + int ret = 0; > + > + spin_lock(&inode_lock); > + list_for_each_entry(inode, list, i_list) { > + if (inode->i_sb == sb) { > + ret = 1; > + break; > + } > + } > + spin_unlock(&inode_lock); > + return ret; > } > > int sb_has_dirty_inodes(struct super_block *sb) > { > - return !list_empty(&sb->s_dirty) || > - !list_empty(&sb->s_io) || > - !list_empty(&sb->s_more_io); > + struct backing_dev_info *bdi; > + int ret = 0; > + > + /* > + * This is REALLY expensive right now, but it'll go away > + * when the bdi writeback is introduced > + */ > + rcu_read_lock(); > + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { > + if (sb_on_inode_list(sb, &bdi->b_dirty) || > + sb_on_inode_list(sb, &bdi->b_io) || > + sb_on_inode_list(sb, &bdi->b_more_io)) { > + ret = 1; > + break; > + } > + } > + rcu_read_unlock(); > + > + return ret; > } > EXPORT_SYMBOL(sb_has_dirty_inodes); > > @@ -305,11 +338,11 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) > /* > * We didn't write back all the pages. nfs_writepages() > * sometimes bales out without doing anything. Redirty > - * the inode; Move it from s_io onto s_more_io/s_dirty. > + * the inode; Move it from b_io onto b_more_io/b_dirty. > */ > /* > * akpm: if the caller was the kupdate function we put > - * this inode at the head of s_dirty so it gets first > + * this inode at the head of b_dirty so it gets first > * consideration. Otherwise, move it to the tail, for > * the reasons described there. I'm not really sure > * how much sense this makes. Presumably I had a good > @@ -319,7 +352,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) > if (wbc->for_kupdate) { > /* > * For the kupdate function we move the inode > - * to s_more_io so it will get more writeout as > + * to b_more_io so it will get more writeout as > * soon as the queue becomes uncongested. > */ > inode->i_state |= I_DIRTY_PAGES; > @@ -385,10 +418,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) { > /* > * We're skipping this inode because it's locked, and we're not > - * doing writeback-for-data-integrity. Move it to s_more_io so > - * that writeback can proceed with the other inodes on s_io. > + * doing writeback-for-data-integrity. Move it to b_more_io so > + * that writeback can proceed with the other inodes on b_io. > * We'll have another go at writing back this inode when we > - * completed a full scan of s_io. > + * completed a full scan of b_io. > */ > requeue_io(inode); > return 0; > @@ -411,51 +444,24 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > return __sync_single_inode(inode, wbc); > } > > -/* > - * Write out a superblock's list of dirty inodes. A wait will be performed > - * upon no inodes, all inodes or the final one, depending upon sync_mode. > - * > - * If older_than_this is non-NULL, then only write out inodes which > - * had their first dirtying at a time earlier than *older_than_this. > - * > - * If we're a pdlfush thread, then implement pdflush collision avoidance > - * against the entire list. > - * > - * If `bdi' is non-zero then we're being asked to writeback a specific queue. > - * This function assumes that the blockdev superblock's inodes are backed by > - * a variety of queues, so all inodes are searched. For other superblocks, > - * assume that all inodes are backed by the same queue. > - * > - * FIXME: this linear search could get expensive with many fileystems. But > - * how to fix? We need to go from an address_space to all inodes which share > - * a queue with that address_space. (Easy: have a global "dirty superblocks" > - * list). > - * > - * The inodes to be written are parked on sb->s_io. They are moved back onto > - * sb->s_dirty as they are selected for writing. This way, none can be missed > - * on the writer throttling path, and we get decent balancing between many > - * throttled threads: we don't want them all piling up on inode_sync_wait. > - */ > -void generic_sync_sb_inodes(struct super_block *sb, > - struct writeback_control *wbc) > +static void generic_sync_bdi_inodes(struct backing_dev_info *bdi, > + struct writeback_control *wbc, > + int is_blkdev) > { > const unsigned long start = jiffies; /* livelock avoidance */ > - int sync = wbc->sync_mode == WB_SYNC_ALL; > > spin_lock(&inode_lock); > - if (!wbc->for_kupdate || list_empty(&sb->s_io)) > - queue_io(sb, wbc->older_than_this); > + if (!wbc->for_kupdate || list_empty(&bdi->b_io)) > + queue_io(bdi, wbc->older_than_this); > > - while (!list_empty(&sb->s_io)) { > - struct inode *inode = list_entry(sb->s_io.prev, > + while (!list_empty(&bdi->b_io)) { > + struct inode *inode = list_entry(bdi->b_io.prev, > struct inode, i_list); > - struct address_space *mapping = inode->i_mapping; > - struct backing_dev_info *bdi = mapping->backing_dev_info; > long pages_skipped; > > if (!bdi_cap_writeback_dirty(bdi)) { > redirty_tail(inode); > - if (sb_is_blkdev_sb(sb)) { > + if (is_blkdev) { > /* > * Dirty memory-backed blockdev: the ramdisk > * driver does this. Skip just this inode > @@ -472,14 +478,14 @@ void generic_sync_sb_inodes(struct super_block *sb, > > if (wbc->nonblocking && bdi_write_congested(bdi)) { > wbc->encountered_congestion = 1; > - if (!sb_is_blkdev_sb(sb)) > + if (!is_blkdev) > break; /* Skip a congested fs */ > requeue_io(inode); > continue; /* Skip a congested blockdev */ > } > > if (wbc->bdi && bdi != wbc->bdi) { > - if (!sb_is_blkdev_sb(sb)) > + if (!is_blkdev) > break; /* fs has the wrong queue */ > requeue_io(inode); > continue; /* blockdev has wrong queue */ > @@ -514,13 +520,55 @@ void generic_sync_sb_inodes(struct super_block *sb, > wbc->more_io = 1; > break; > } > - if (!list_empty(&sb->s_more_io)) > + if (!list_empty(&bdi->b_more_io)) > wbc->more_io = 1; > } > > - if (sync) { > + spin_unlock(&inode_lock); > + /* Leave any unwritten inodes on b_io */ > +} > + > +/* > + * Write out a superblock's list of dirty inodes. A wait will be performed > + * upon no inodes, all inodes or the final one, depending upon sync_mode. > + * > + * If older_than_this is non-NULL, then only write out inodes which > + * had their first dirtying at a time earlier than *older_than_this. > + * > + * If we're a pdlfush thread, then implement pdflush collision avoidance > + * against the entire list. > + * > + * If `bdi' is non-zero then we're being asked to writeback a specific queue. > + * This function assumes that the blockdev superblock's inodes are backed by > + * a variety of queues, so all inodes are searched. For other superblocks, > + * assume that all inodes are backed by the same queue. > + * > + * FIXME: this linear search could get expensive with many fileystems. But > + * how to fix? We need to go from an address_space to all inodes which share > + * a queue with that address_space. (Easy: have a global "dirty superblocks" > + * list). > + * > + * The inodes to be written are parked on bdi->b_io. They are moved back onto > + * bdi->b_dirty as they are selected for writing. This way, none can be missed > + * on the writer throttling path, and we get decent balancing between many > + * throttled threads: we don't want them all piling up on inode_sync_wait. > + */ > +void generic_sync_sb_inodes(struct super_block *sb, > + struct writeback_control *wbc) > +{ > + const int is_blkdev = sb_is_blkdev_sb(sb); > + struct backing_dev_info *bdi; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) > + generic_sync_bdi_inodes(bdi, wbc, is_blkdev); > + rcu_read_unlock(); > + > + if (wbc->sync_mode == WB_SYNC_ALL) { > struct inode *inode, *old_inode = NULL; > > + spin_lock(&inode_lock); > + > /* > * Data integrity sync. Must wait for all pages under writeback, > * because there may have been pages dirtied before our sync > @@ -557,10 +605,8 @@ void generic_sync_sb_inodes(struct super_block *sb, > } > spin_unlock(&inode_lock); > iput(old_inode); > - } else > - spin_unlock(&inode_lock); > + } > > - return; /* Leave any unwritten inodes on s_io */ > } > EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); > > @@ -575,8 +621,8 @@ static void sync_sb_inodes(struct super_block *sb, > * > * Note: > * We don't need to grab a reference to superblock here. If it has non-empty > - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed > - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all > + * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed > + * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all > * empty. Since __sync_single_inode() regains inode_lock before it finally moves > * inode from superblock lists we are OK. > * > diff --git a/fs/super.c b/fs/super.c > index 8349ed6..e3c5b6f 100644 > --- a/fs/super.c > +++ b/fs/super.c > @@ -64,9 +64,6 @@ static struct super_block *alloc_super(struct file_system_type *type) > s = NULL; > goto out; > } > - INIT_LIST_HEAD(&s->s_dirty); > - INIT_LIST_HEAD(&s->s_io); > - INIT_LIST_HEAD(&s->s_more_io); > INIT_LIST_HEAD(&s->s_files); > INIT_LIST_HEAD(&s->s_instances); > INIT_HLIST_HEAD(&s->s_anon); > diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h > index bee52ab..bb58c95 100644 > --- a/include/linux/backing-dev.h > +++ b/include/linux/backing-dev.h > @@ -40,6 +40,8 @@ enum bdi_stat_item { > #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) > > struct backing_dev_info { > + struct list_head bdi_list; > + > unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ > unsigned long state; /* Always use atomic bitops on this */ > unsigned int capabilities; /* Device capabilities */ > @@ -58,6 +60,10 @@ struct backing_dev_info { > > struct device *dev; > > + struct list_head b_dirty; /* dirty inodes */ > + struct list_head b_io; /* parked for writeback */ > + struct list_head b_more_io; /* parked for more writeback */ > + > #ifdef CONFIG_DEBUG_FS > struct dentry *debug_dir; > struct dentry *debug_stats; > @@ -72,6 +78,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, > int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); > void bdi_unregister(struct backing_dev_info *bdi); > > +extern spinlock_t bdi_lock; > +extern struct list_head bdi_list; > + > static inline void __add_bdi_stat(struct backing_dev_info *bdi, > enum bdi_stat_item item, s64 amount) > { > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 92734c0..3c90eb4 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -648,7 +648,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping) > > struct inode { > struct hlist_node i_hash; > - struct list_head i_list; > + struct list_head i_list; /* backing dev IO list */ > struct list_head i_sb_list; > struct list_head i_dentry; > unsigned long i_ino; > @@ -1155,9 +1155,6 @@ struct super_block { > struct xattr_handler **s_xattr; > > struct list_head s_inodes; /* all inodes */ > - struct list_head s_dirty; /* dirty inodes */ > - struct list_head s_io; /* parked for writeback */ > - struct list_head s_more_io; /* parked for more writeback */ > struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ > struct list_head s_files; > /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ > diff --git a/mm/backing-dev.c b/mm/backing-dev.c > index 8e85874..cf1528b 100644 > --- a/mm/backing-dev.c > +++ b/mm/backing-dev.c > @@ -7,8 +7,9 @@ > #include <linux/writeback.h> > #include <linux/device.h> > > - > static struct class *bdi_class; > +DEFINE_SPINLOCK(bdi_lock); > +LIST_HEAD(bdi_list); > > #ifdef CONFIG_DEBUG_FS > #include <linux/debugfs.h> > @@ -187,6 +188,10 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, > goto exit; > } > > + spin_lock(&bdi_lock); > + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); > + spin_unlock(&bdi_lock); > + > bdi->dev = dev; > bdi_debug_register(bdi, dev_name(dev)); > > @@ -201,9 +206,23 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) > } > EXPORT_SYMBOL(bdi_register_dev); > > +static void bdi_remove_from_list(struct backing_dev_info *bdi) > +{ > + spin_lock(&bdi_lock); > + list_del_rcu(&bdi->bdi_list); > + spin_unlock(&bdi_lock); > + > + /* > + * In case the bdi is freed right after unregister, we need to > + * make sure any RCU sections have exited > + */ > + synchronize_rcu(); > +} > + > void bdi_unregister(struct backing_dev_info *bdi) > { > if (bdi->dev) { > + bdi_remove_from_list(bdi); > bdi_debug_unregister(bdi); > device_unregister(bdi->dev); > bdi->dev = NULL; > @@ -221,6 +240,10 @@ int bdi_init(struct backing_dev_info *bdi) > bdi->min_ratio = 0; > bdi->max_ratio = 100; > bdi->max_prop_frac = PROP_FRAC_BASE; > + INIT_LIST_HEAD(&bdi->bdi_list); > + INIT_LIST_HEAD(&bdi->b_io); > + INIT_LIST_HEAD(&bdi->b_dirty); > + INIT_LIST_HEAD(&bdi->b_more_io); > > for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { > err = percpu_counter_init(&bdi->bdi_stat[i], 0); > @@ -235,6 +258,8 @@ int bdi_init(struct backing_dev_info *bdi) > err: > while (i--) > percpu_counter_destroy(&bdi->bdi_stat[i]); > + > + bdi_remove_from_list(bdi); > } > > return err; > @@ -245,6 +270,10 @@ void bdi_destroy(struct backing_dev_info *bdi) > { > int i; > > + WARN_ON(!list_empty(&bdi->b_dirty)); > + WARN_ON(!list_empty(&bdi->b_io)); > + WARN_ON(!list_empty(&bdi->b_more_io)); > + > bdi_unregister(bdi); > > for (i = 0; i < NR_BDI_STAT_ITEMS; i++) > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index 74dc57c..3ec11d8 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -319,7 +319,6 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty) > /* > * > */ > -static DEFINE_SPINLOCK(bdi_lock); > static unsigned int bdi_min_ratio; > > int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) > -- > 1.6.2 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- Jan Kara <jack@xxxxxxx> SuSE CR Labs -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html