Switch inode dirty tracking lists to be per superblock instead of per bdi. This is a major step towards filesystems being able to do their own dirty tracking and selection of inodes for writeback if they desire so (e.g. because they journal or COW data and need to writeback inodes & pages in a specific order unknown to generic writeback code). Per superblock dirty lists also make selecting inodes for writeback somewhat simpler because we don't have to search for inodes from a particular superblock for some kinds of writeback (OTOH we pay for this by having to iterate through superblocks for all-bdi type of writeback) and this simplification will allow for an easier switch to a better scaling data structure for dirty inodes. Signed-off-by: Jan Kara <jack@xxxxxxx> --- fs/block_dev.c | 11 +++-- fs/fs-writeback.c | 114 +++++++++++++++++++++++++++----------------- fs/inode.c | 8 ++-- fs/super.c | 9 ++++ include/linux/backing-dev.h | 23 +++++---- include/linux/fs.h | 18 +++++++ mm/backing-dev.c | 92 ++++++++++++++++++++++------------- mm/filemap.c | 2 +- 8 files changed, 179 insertions(+), 98 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d7274619bf9..01310d2c40a3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -62,17 +62,18 @@ static void bdev_inode_switch_bdi(struct inode *inode, if (unlikely(dst == old)) /* deadlock avoidance */ return; - bdi_lock_two(&old->wb, &dst->wb); + bdi_lock_two(&old->wb_queue, &dst->wb_queue); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) { - if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) + if (bdi_cap_writeback_dirty(dst) && + !wb_has_dirty_io(&dst->wb_queue)) wakeup_bdi = true; - list_move(&inode->i_wb_list, &dst->wb.b_dirty); + list_move(&inode->i_wb_list, &dst->wb_queue.b_dirty); } spin_unlock(&inode->i_lock); - spin_unlock(&old->wb.list_lock); - spin_unlock(&dst->wb.list_lock); + spin_unlock(&old->wb_queue.list_lock); + 
spin_unlock(&dst->wb_queue.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(dst); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f9d8aa7f1ff7..e80d1b9ac355 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -76,6 +76,15 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (sb_is_blkdev_sb(sb)) + return &inode->i_mapping->backing_dev_info->wb_queue; + return &sb->s_dirty_inodes; +} + static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_wb_list); @@ -184,11 +193,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb_queue = inode_to_wb(inode); - spin_lock(&bdi->wb.list_lock); + spin_lock(&wb_queue->list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + spin_unlock(&wb_queue->list_lock); } /* @@ -480,7 +489,7 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { int ret = 0; - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct bdi_writeback *wb_queue = inode_to_wb(inode); spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -516,7 +525,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = __writeback_single_inode(inode, wbc); - spin_lock(&wb->list_lock); + spin_lock(&wb_queue->list_lock); spin_lock(&inode->i_lock); /* * If inode is clean, remove it from writeback lists. 
Otherwise don't @@ -524,7 +533,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) */ if (!(inode->i_state & I_DIRTY)) list_del_init(&inode->i_wb_list); - spin_unlock(&wb->list_lock); + spin_unlock(&wb_queue->list_lock); inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); @@ -563,8 +572,7 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, } /* - * Refill b_io list if needed and start writing inodes on that list belonging - * to @work->sb (if set). + * Refill b_io list if needed and start writing inodes on that list * * Return the number of pages and/or inodes written. */ @@ -591,16 +599,6 @@ static long writeback_inodes(struct bdi_writeback *wb, while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); - if (work->sb && inode->i_sb != work->sb) { - /* - * We only want to write back data for this - * superblock, move all inodes not belonging - * to it back onto the dirty list. - */ - redirty_tail(inode, wb); - continue; - } - /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter @@ -672,6 +670,15 @@ static long writeback_inodes(struct bdi_writeback *wb, break; } } + + /* + * In case we made no progress in current IO batch and there are no + * inodes postponed for further writeback, set WB_STATE_STALLED + * so that flusher doesn't busyloop in case no dirty inodes can be + * written. 
+ */ + if (!wrote && list_empty(&wb->b_more_io)) + wb->state |= WB_STATE_STALLED; spin_unlock(&wb->list_lock); return wrote; @@ -729,8 +736,8 @@ static long bdi_writeback(struct backing_dev_info *bdi, unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long oldest_jif; - long progress; - struct bdi_writeback *wb = &bdi->wb; + long progress = 1; + struct bdi_writeback *wb; oldest_jif = jiffies; work->older_than_this = &oldest_jif; @@ -771,26 +778,47 @@ static long bdi_writeback(struct backing_dev_info *bdi, } else if (work->for_background) oldest_jif = jiffies; + /* + * If we made some progress, clear stalled state to retry other + * writeback queues as well. + */ + if (progress) { + spin_lock_bh(&bdi->wb_lock); + list_for_each_entry(wb, &bdi->wq_list, bdi_list) { + wb->state &= ~WB_STATE_STALLED; + } + spin_unlock_bh(&bdi->wb_lock); + } + + if (work->sb) { + wb = &work->sb->s_dirty_inodes; + if (wb->state & WB_STATE_STALLED) + wb = NULL; + } else { + spin_lock_bh(&bdi->wb_lock); + list_for_each_entry(wb, &bdi->wq_list, bdi_list) { + if (!(wb->state & WB_STATE_STALLED) && + wb_has_dirty_io(wb)) { + /* + * Make us start with the following + * writeback queue next time + */ + list_move(&bdi->wq_list, &wb->bdi_list); + goto got_wb; + } + } + wb = NULL; +got_wb: + spin_unlock_bh(&bdi->wb_lock); + + } + /* No more dirty inodes. Stop writeback. */ + if (!wb) + break; trace_writeback_start(bdi, work); progress = writeback_inodes(wb, work); trace_writeback_written(bdi, work); - update_bandwidth(bdi, wb_start); - /* - * Did we write something? Try for more - * - * Dirty inodes are moved to b_io for writeback in batches. - * The completion of the current batch does not necessarily - * mean the overall work is done. So we keep looping as long - * as made some progress on cleaning pages or inodes. 
- */ - if (progress) - continue; - /* - * No more inodes for IO, bail - */ - if (list_empty(&wb->b_more_io)) - break; } return nr_pages - work->nr_pages; @@ -1051,7 +1079,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = NULL; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1110,27 +1137,28 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!was_dirty) { bool wakeup_bdi = false; - bdi = inode_to_bdi(inode); + struct bdi_writeback *wb_queue = inode_to_wb(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); + spin_lock(&wb_queue->list_lock); if (bdi_cap_writeback_dirty(bdi)) { WARN(!test_bit(BDI_registered, &bdi->state), "bdi-%s not registered\n", bdi->name); /* * If this is the first dirty inode for this - * bdi, we have to wake-up the corresponding + * sb, we will wake-up the corresponding * bdi thread to make sure background * write-back happens later. 
*/ - if (!wb_has_dirty_io(&bdi->wb)) + if (!wb_has_dirty_io(wb_queue)) wakeup_bdi = true; } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&bdi->wb.list_lock); + list_move(&inode->i_wb_list, &wb_queue->b_dirty); + spin_unlock(&wb_queue->list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); diff --git a/fs/inode.c b/fs/inode.c index 6eecb7ff0b9a..a9d40e57f73d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -29,8 +29,10 @@ * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * sb->s_dirty_inodes.list_lock protects: + * sb->s_dirty_inodes.b_{dirty,io,more_io}, inode->i_wb_list + * Block device inodes are an exception and their i_wb_list is protected by + * bdi->wb_queue.list_lock * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -40,7 +42,7 @@ * inode->i_lock * Inode LRU list locks * - * bdi->wb.list_lock + * sb->s_dirty_inodes.list_lock * inode->i_lock * * inode_hash_lock diff --git a/fs/super.c b/fs/super.c index d20d5b11dedf..9e4867da6c5d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -188,6 +188,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); + bdi_writeback_queue_init(&s->s_dirty_inodes, s); if (list_lru_init(&s->s_dentry_lru)) goto fail; @@ -995,6 +996,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, goto error; } + bdi_writeback_queue_register(&s->s_dirty_inodes); s->s_flags |= MS_ACTIVE; bdev->bd_super = s; } @@ -1015,6 +1017,13 @@ void kill_block_super(struct super_block *sb) struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; + /* + * Unregister superblock from periodic writeback. 
There may be + * writeback still running for it but we call sync_filesystem() later + * and that will execute only after any background writeback is stopped. + * This guarantees flusher won't touch sb that's going away. + */ + bdi_writeback_queue_unregister(&sb->s_dirty_inodes); bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5bfe30d6f01f..ff3e2a3eb326 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -48,15 +48,6 @@ enum bdi_stat_item { #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) -struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ - - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - spinlock_t list_lock; /* protects the b_* lists */ -}; - struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ @@ -94,10 +85,13 @@ struct backing_dev_info { struct delayed_work dwork; /* work item used for writeback */ - struct bdi_writeback wb; /* default writeback info for this bdi */ spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling, - updates of bandwidth & ratelimit */ + updates of bandwidth & ratelimit, wq_list */ + struct bdi_writeback wb_queue; /* default writeback queue for this bdi. + Used for block device inodes of this + device.
*/ + struct list_head wq_list; /* list of writeback queues on this bdi */ struct list_head work_list; struct device *dev; @@ -112,7 +106,9 @@ struct backing_dev_info { static inline struct backing_dev_info *wb_bdi(struct bdi_writeback *wb) { - return wb->bdi; + if (!wb->sb) + return container_of(wb, struct backing_dev_info, wb_queue); + return wb->sb->s_bdi; } int __must_check bdi_init(struct backing_dev_info *bdi); @@ -124,6 +120,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); +void bdi_writeback_queue_init(struct bdi_writeback *wb, struct super_block *sb); +void bdi_writeback_queue_register(struct bdi_writeback *wb_queue); +void bdi_writeback_queue_unregister(struct bdi_writeback *wb_queue); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); diff --git a/include/linux/fs.h b/include/linux/fs.h index e11d60cc867b..894fb42438ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1172,6 +1172,22 @@ struct sb_writers { #endif }; +#define WB_STATE_STALLED 0x01 /* Writeback for this queue is stalled */ + +struct bdi_writeback { + struct super_block *sb; /* our parent superblock, + NULL for default bdi queue */ + struct list_head bdi_list; /* List of writeback queues on bdi */ + + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ + spinlock_t list_lock; /* protects the b_* lists */ + int state; /* state of writeback in this queue, + * manipulated only from flusher -> + * no locking */ +}; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1203,6 
+1219,8 @@ struct super_block { struct hlist_node s_instances; struct quota_info s_dquot; /* Diskquota specific options */ + struct bdi_writeback s_dirty_inodes; /* Tracking of dirty inodes */ + struct sb_writers s_writers; char s_id[32]; /* Informational name */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c44ba43d580d..10ab9c34e155 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -65,22 +65,9 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; - struct inode *inode; - - nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_dirty, i_wb_list) - nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_wb_list) - nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_wb_list) - nr_more_io++; - spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -95,9 +82,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" - "b_dirty: %10lu\n" - "b_io: %10lu\n" - "b_more_io: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -108,9 +92,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), (unsigned long) K(bdi->write_bandwidth), - nr_dirty, - nr_io, - nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -275,7 +256,17 @@ subsys_initcall(default_bdi_init); int bdi_has_dirty_io(struct backing_dev_info *bdi) { - return wb_has_dirty_io(&bdi->wb); + struct bdi_writeback *wb_queue; + + spin_lock_bh(&bdi->wb_lock); + 
list_for_each_entry(wb_queue, &bdi->wq_list, bdi_list) { + if (wb_has_dirty_io(wb_queue)) { + spin_unlock_bh(&bdi->wb_lock); + return 1; + } + } + spin_unlock_bh(&bdi->wb_lock); + return 0; } /* @@ -421,15 +412,43 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +void bdi_writeback_queue_init(struct bdi_writeback *wb, struct super_block *sb) { - memset(wb, 0, sizeof(*wb)); - - wb->bdi = bdi; + wb->sb = sb; + INIT_LIST_HEAD(&wb->bdi_list); INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); + wb->state = 0; +} + +/* + * Register writeback queue with BDI so that background writeback is run for + * it. + */ +void bdi_writeback_queue_register(struct bdi_writeback *wb_queue) +{ + struct backing_dev_info *bdi = wb_bdi(wb_queue); + + spin_lock_bh(&bdi->wb_lock); + list_add(&wb_queue->bdi_list, &bdi->wq_list); + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * Unregister writeback queue from BDI. No further background writeback will be + * started against this superblock. However note that there may be writeback + * still running for the sb. 
+ */ +void bdi_writeback_queue_unregister(struct bdi_writeback *wb_queue) +{ + struct backing_dev_info *bdi = wb_bdi(wb_queue); + + /* Make sure flusher cannot find the superblock any longer */ + spin_lock_bh(&bdi->wb_lock); + list_del_init(&wb_queue->bdi_list); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -449,10 +468,12 @@ int bdi_init(struct backing_dev_info *bdi) spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->work_list); + INIT_LIST_HEAD(&bdi->wq_list); bdi->last_old_flush = jiffies; INIT_DELAYED_WORK(&bdi->dwork, bdi_writeback_workfn); - bdi_wb_init(&bdi->wb, bdi); + bdi_writeback_queue_init(&bdi->wb_queue, NULL); + bdi_writeback_queue_register(&bdi->wb_queue); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -486,18 +507,21 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + /* bdi disappearing under fs. Bad, bad, bad! */ + BUG_ON(!list_is_singular(&bdi->wq_list)); /* * Splice our entries to the default_backing_dev_info, if this - * bdi disappears + * bdi disappears. 
We can still hold some device inodes in dirty lists */ - if (bdi_has_dirty_io(bdi)) { - struct bdi_writeback *dst = &default_backing_dev_info.wb; - - bdi_lock_two(&bdi->wb, dst); - list_splice(&bdi->wb.b_dirty, &dst->b_dirty); - list_splice(&bdi->wb.b_io, &dst->b_io); - list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&bdi->wb.list_lock); + if (wb_has_dirty_io(&bdi->wb_queue)) { + struct bdi_writeback *dst = &default_backing_dev_info.wb_queue; + struct bdi_writeback *src = &bdi->wb_queue; + + bdi_lock_two(src, dst); + list_splice(&src->b_dirty, &dst->b_dirty); + list_splice(&src->b_io, &dst->b_io); + list_splice(&src->b_more_io, &dst->b_more_io); + spin_unlock(&src->list_lock); spin_unlock(&dst->list_lock); } diff --git a/mm/filemap.c b/mm/filemap.c index dafb06f70a09..cbc3c647a190 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,7 +80,7 @@ * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * bdi->wb.list_lock + * bdi->wb_queue.list_lock / sb->s_dirty_inodes.list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html