On 10/21/2010 06:55 PM, Ted Ts'o wrote: > From be341f01064389dd7d82ed0379765656a816aa8a Mon Sep 17 00:00:00 2001 > From: Theodore Ts'o <tytso@xxxxxxx> > Date: Wed, 20 Oct 2010 21:27:13 -0400 > Subject: [PATCH 6/6] Ext4: Use bio layer instead of buffer layer in mpage_da_submit_io > > Rough draft; not done yet > --- > fs/buffer.c | 36 +++++ > fs/ext4/Makefile | 2 +- > fs/ext4/ext4.h | 33 ++++- > fs/ext4/extents.c | 4 +- > fs/ext4/inode.c | 118 ++------------- > fs/ext4/mballoc.c | 8 +- > fs/ext4/page-io.c | 390 +++++++++++++++++++++++++++++++++++++++++++++++++ > fs/ext4/super.c | 8 +- > fs/jbd2/commit.c | 17 ++ > fs/jbd2/transaction.c | 5 + > mm/filemap.c | 7 + > 11 files changed, 518 insertions(+), 110 deletions(-) > create mode 100644 fs/ext4/page-io.c > > diff --git a/fs/buffer.c b/fs/buffer.c > index 3e7dca2..8789975 100644 > --- a/fs/buffer.c > +++ b/fs/buffer.c <snip> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index f379ceb..2ceb1e8 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -2016,8 +2016,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, > struct buffer_head *bh, *page_bufs = NULL; > int journal_data = ext4_should_journal_data(inode); > sector_t pblock, cur_logical; > + struct ext4_io_submit io_submit; > > BUG_ON(mpd->next_page <= mpd->first_page); > + memset(&io_submit, 0, sizeof(io_submit)); > /* > * We need to start from the first_page to the next_page - 1 > * to make sure we also write the mapped dirty buffer_heads. > @@ -2109,16 +2111,16 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, > /* mark the buffer_heads as dirty & uptodate */ > block_commit_write(page, 0, len); > > - if (journal_data && PageChecked(page)) > + /* > + * Delalloc doesn't support data journalling, > + * but eventually maybe we'll lift this > + * restriction. > + */ > + if (unlikely(journal_data && PageChecked(page))) > err = __ext4_journalled_writepage(page, len); > - else if (buffer_uninit(page_bufs)) { > - ext4_set_bh_endio(page_bufs, inode); > - err = block_write_full_page_endio(page, > - noalloc_get_block_write, > - mpd->wbc, ext4_end_io_buffer_write); > - } else > - err = block_write_full_page(page, > - noalloc_get_block_write, mpd->wbc); > + else > + err = ext4_bio_write_page(&io_submit, page, > + len, mpd->wbc); > > if (!err) > mpd->pages_written++; > @@ -2131,6 +2133,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, > } > pagevec_release(&pvec); > } > + ext4_io_submit(&io_submit); > return ret; > } > <snip> > diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c > new file mode 100644 > index 0000000..08a6b2a > --- /dev/null > +++ b/fs/ext4/page-io.c > @@ -0,0 +1,390 @@ > +/* > + * linux/fs/ext4/page-io.c > + * > + * This contains the new page_io functions for ext4 > + * > + * Written by Theodore Ts'o, 2010. > + */ > + > +#include <linux/module.h> > +#include <linux/fs.h> > +#include <linux/time.h> > +#include <linux/jbd2.h> > +#include <linux/highuid.h> > +#include <linux/pagemap.h> > +#include <linux/quotaops.h> > +#include <linux/string.h> > +#include <linux/buffer_head.h> > +#include <linux/writeback.h> > +#include <linux/pagevec.h> > +#include <linux/mpage.h> > +#include <linux/namei.h> > +#include <linux/uio.h> > +#include <linux/bio.h> > +#include <linux/workqueue.h> > +#include <linux/kernel.h> > +#include <linux/slab.h> > + > +#include "ext4_jbd2.h" > +#include "xattr.h" > +#include "acl.h" > +#include "ext4_extents.h" > + > +#define PDEBUG > + > +static struct kmem_cache *io_page_cachep; > + > +int __init init_ext4_pageio(void) > +{ > + io_page_cachep = > + kmem_cache_create("ext4_io_page", > + sizeof(struct ext4_io_page), > + 0, SLAB_RECLAIM_ACCOUNT, NULL); > + if (io_page_cachep == NULL) > + return -ENOMEM; > + > + return 0; > +} > + > +void exit_ext4_pageio(void) > +{ > + kmem_cache_destroy(io_page_cachep); > +} > + > +void ext4_free_io_end(ext4_io_end_t *io) > +{ > + struct ext4_io_page *io_page, *next; > + > + BUG_ON(!io); > + if (io->page) > + put_page(io->page); > + iput(io->inode); > + for (io_page = io->io_page; io_page; io_page = next) { > + put_page(io_page->p_page); > + next = io_page->p_next; > + kmem_cache_free(io_page_cachep, io_page); > + } > + kfree(io); > +} > + > +/* > + * check a range of space and convert unwritten extents to written. > + */ > +int ext4_end_io_nolock(ext4_io_end_t *io) > +{ > + struct inode *inode = io->inode; > + loff_t offset = io->offset; > + ssize_t size = io->size; > + int ret = 0; > + > + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," > + "list->prev 0x%p\n", > + io, inode->i_ino, io->list.next, io->list.prev); > + > + if (list_empty(&io->list)) > + return ret; > + > + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) > + return ret; > + > + ret = ext4_convert_unwritten_extents(inode, offset, size); > + if (ret < 0) { > + printk(KERN_EMERG "%s: failed to convert unwritten" > + "extents to written extents, error is %d" > + " io is still on inode %lu aio dio list\n", > + __func__, ret, inode->i_ino); > + return ret; > + } > + > + if (io->iocb) > + aio_complete(io->iocb, io->result, 0); > + /* clear the DIO AIO unwritten flag */ > + io->flag &= ~EXT4_IO_END_UNWRITTEN; > + return ret; > +} > + > +/* > + * work on completed aio dio IO, to convert unwritten extents to extents > + */ > +static void ext4_end_io_work(struct work_struct *work) > +{ > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); > + struct inode *inode = io->inode; > + struct ext4_inode_info *ei = EXT4_I(inode); > + unsigned long flags; > + int ret; > + > + mutex_lock(&inode->i_mutex); > + ret = ext4_end_io_nolock(io); > + if (ret < 0) { > + mutex_unlock(&inode->i_mutex); > + return; > + } > + > + spin_lock_irqsave(&ei->i_completed_io_lock, flags); > + if (!list_empty(&io->list)) > + list_del_init(&io->list); > + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); > + mutex_unlock(&inode->i_mutex); > + ext4_free_io_end(io); > +} > + > +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) > +{ > + ext4_io_end_t *io = NULL; > + > + io = kmalloc(sizeof(*io), flags); > + > + if (io) { > + memset(io, 0, sizeof(*io)); > + igrab(inode); > + io->inode = inode; > + INIT_WORK(&io->work, ext4_end_io_work); > + INIT_LIST_HEAD(&io->list); > + } > + > + return io; > +} > + > +static void ext4_end_bio(struct bio *bio, int error) > +{ > + ext4_io_end_t *io_end = bio->bi_private; > + struct workqueue_struct *wq; > + struct inode *inode; > + unsigned long flags; > + struct ext4_io_page *io_page; > + > + inode = io_end->inode; > +#ifdef PDEBUG > + trace_printk("%s: enter: ino %lu io_end=%p", inode->i_sb->s_id, > + inode->i_ino, io_end); > +#endif > + bio->bi_private = NULL; > + bio->bi_end_io = NULL; > + bio_put(bio); > + > + if (!io_end) > + return; > + > + if (!(inode->i_sb->s_flags & MS_ACTIVE)) { > + printk("sb umounted, discard end_io request for inode %lu\n", > + io_end->inode->i_ino); > + ext4_free_io_end(io_end); > + return; > + } > + > + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) > + error = 0; > + if (error) > + io_end->flag |= EXT4_IO_END_ERROR; > + > + for (io_page = io_end->io_page; io_page; io_page = io_page->p_next) { > + struct page *page = io_page->p_page; > + struct buffer_head *bh, *head = page_buffers(io_page->p_page); > + > + if (error) > + SetPageError(page); > + BUG_ON(!head); > + if (head->b_size == PAGE_CACHE_SIZE) > + clear_buffer_dirty(head); > + else { > + /* > + * If block_size < page_size we need to worry > + * about partial writes. > + */ > + loff_t offset; > + loff_t io_end_offset = io_end->offset + io_end->size; > + int partial = 0; > + > + offset = (sector_t) page->index << PAGE_CACHE_SHIFT; > + bh = head; > + do { > + if ((offset >= io_end->offset) && > + (offset+bh->b_size <= io_end_offset)) > + clear_buffer_dirty(bh); > + if (buffer_dirty(bh)) > + partial = 1; > + offset += bh->b_size; > + bh = bh->b_this_page; > + } while (bh != head); > + if (partial) > + continue; > + } > +#ifdef PDEBUG > + trace_printk("%s: end_page_writeback for %lu:%lu\n", > + inode->i_sb->s_id, inode->i_ino, > + (unsigned long) page->index); > +#endif > + end_page_writeback(page); > + } > + > + /* Add the io_end to per-inode completed io list*/ > + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); > + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); > + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); > + > + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; > + /* queue the work to convert unwritten extents to written */ > + queue_work(wq, &io_end->work); > +#ifdef PDEBUG > + trace_printk("%s: exit: ino %lu", inode->i_sb->s_id, > + io_end->inode->i_ino); > +#endif > +} > + > +void ext4_io_submit(struct ext4_io_submit *io) > +{ > + if (!io->io_bio) > + return; > +#ifdef PDEBUG > + trace_printk("%s: io submitted io_end %p\n", > + io->io_end->inode->i_sb->s_id, io->io_end); > +#endif > + submit_bio(io->io_op, io->io_bio); > + ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); > + bio_put(io->io_bio); The extra get/put is only done for the duration of the ASSERT above, right? I'd put a comment. And why don't you just call the _get here just before submit_bio instead of down at io_submit_init. > + io->io_bio = 0; > + io->io_op = 0; > + io->io_end = 0; > +} > + > +static void io_submit_init(struct ext4_io_submit *io, > + struct inode *inode, > + struct writeback_control *wbc, > + struct buffer_head *bh) > +{ > + ext4_io_end_t *io_end; > + struct page *page = bh->b_page; > + int nvecs = bio_get_nr_vecs(bh->b_bdev); > + struct bio *bio; > + > + do { > + bio = bio_alloc(GFP_NOIO, nvecs); > + nvecs >>= 1; > + } while (bio == NULL); This is surly bad. bio_alloc must be allowed to fail (Specially with GFP_NOIO). You should only loop down to 1 and then prepare to return -ENOMEM from this function and handle it properly in callers. (Or schedule and wait like below) > + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); > + bio->bi_bdev = bh->b_bdev; > + bio_get(bio); > + > +retry_alloc: > + io_end = ext4_init_io_end(inode, GFP_NOFS); Or you could move the bio allocation to ext4_init_io_end() > + if (!io_end) { > + if (printk_ratelimit()) > + ext4_msg(inode->i_sb, KERN_WARNING, > + "%s: allocation fail\n", __func__); > + schedule(); > + goto retry_alloc; > + } > + io_end->inode = inode; > + io_end->offset = (sector_t)page->index << PAGE_CACHE_SHIFT; > + bio->bi_private = io->io_end = io_end; > + bio->bi_end_io = ext4_end_bio; > + > + io->io_bio = bio; > + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? > + WRITE_SYNC_PLUG : WRITE); > + io->io_next_block = bh->b_blocknr; > +#ifdef PDEBUG > + trace_printk("%s: io_submit_init for ino %lu, nvecs = %d", > + inode->i_sb->s_id, inode->i_ino, nvecs); > +#endif > +} > + > +static noinline_for_stack > +void io_submit_add_bh(struct ext4_io_submit *io, > + struct inode *inode, > + struct writeback_control *wbc, > + struct buffer_head *bh) > +{ > + struct ext4_io_page *new_io_page; > + int ret; > + > +#ifdef PDEBUG > + trace_printk("%s enter: ino %lu blk %lu", inode->i_sb->s_id, > + inode->i_ino, (unsigned long) bh->b_blocknr); > +#endif > + if (buffer_new(bh)) { > + clear_buffer_new(bh); > + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); > + } > + > +retry: > + if (io->io_bio && bh->b_blocknr != io->io_next_block) > + ext4_io_submit(io); > + if (io->io_bio == NULL) > + io_submit_init(io, inode, wbc, bh); > + if (buffer_uninit(bh)) > + io->io_end->flag |= EXT4_IO_END_UNWRITTEN; > + io->io_end->size += bh->b_size; > + io->io_next_block++; > + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); > + if (ret != bh->b_size) { > + submit_and_retry: > +#ifdef PDEBUG > + trace_printk("%s: submit and retry (ret = %d, size=%d, " > + "offset=%lu)\n", inode->i_sb->s_id, ret, > + bh->b_size, bh_offset(bh)); > +#endif > + ext4_io_submit(io); > + goto retry; > + } > + if (!io->io_end->io_page || io->io_end->io_page->p_page != bh->b_page) { > + new_io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); > + if (!new_io_page) > + goto submit_and_retry; > + get_page(bh->b_page); > + new_io_page->p_page = bh->b_page; > + new_io_page->p_next = io->io_end->io_page; > + io->io_end->io_page = new_io_page; > + } > +#ifdef PDEBUG > + trace_printk("%s: exit: ino %lu", inode->i_sb->s_id, inode->i_ino); > +#endif > +} > + > +int ext4_bio_write_page(struct ext4_io_submit *io, > + struct page *page, > + int len, > + struct writeback_control *wbc) > +{ > + struct inode *inode = page->mapping->host; > + unsigned block_start, block_end; > + unsigned blocksize; > + struct buffer_head *bh, *head; > + > +#ifdef PDEBUG > + trace_printk("%s: enter: ino %lu page %lu len %d", inode->i_sb->s_id, > + inode->i_ino, page->index, len); > +#endif > + blocksize = 1 << inode->i_blkbits; > + > + BUG_ON(PageWriteback(page)); > + set_page_writeback(page); > + ClearPageError(page); > + > + for (bh = head = page_buffers(page), block_start = 0; > + bh != head || !block_start; > + block_start=block_end, bh = bh->b_this_page) { > + block_end = block_start + blocksize; > + if (block_start >= len) { > + clear_buffer_dirty(bh); > + set_buffer_uptodate(bh); > + continue; > + } > + io_submit_add_bh(io, inode, wbc, bh); > + } > + unlock_page(page); > + /* > + * If the page was truncated before we could do the writeback, > + * we won't have submitted any pages for I/O. In that case we > + * need to make sure we've cleared the PageWriteback bit from > + * the page to prevent the system from wedging later on. > + */ > + if (len == 0) > + end_page_writeback(page); > +#ifdef PDEBUG > + trace_printk("%s: exit: for ino %lu", inode->i_sb->s_id, > + inode->i_ino); > +#endif > + return 0; > +} > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 16002ec..9f602c2 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -4768,9 +4768,12 @@ static int __init init_ext4_fs(void) > int err; > > ext4_check_flag_values(); > - err = init_ext4_system_zone(); > + err = init_ext4_pageio(); > if (err) > return err; > + err = init_ext4_system_zone(); > + if (err) > + goto out5; > ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); > if (!ext4_kset) > goto out4; > @@ -4811,6 +4814,8 @@ out3: > kset_unregister(ext4_kset); > out4: > exit_ext4_system_zone(); > +out5: > + exit_ext4_pageio(); > return err; > } > > @@ -4826,6 +4831,7 @@ static void __exit exit_ext4_fs(void) > remove_proc_entry("fs/ext4", NULL); > kset_unregister(ext4_kset); > exit_ext4_system_zone(); > + exit_ext4_pageio(); > } > > MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c > index 6494c81..256008b 100644 > --- a/fs/jbd2/commit.c > +++ b/fs/jbd2/commit.c > @@ -631,6 +631,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) > * (which is of type BJ_IO) > */ > JBUFFER_TRACE(jh, "ph3: write metadata"); > +#if 1 /* PDEBUG */ > + trace_printk("@635 %s block %llu\n", journal->j_devname, blocknr); > +#endif > flags = jbd2_journal_write_metadata_buffer(commit_transaction, > jh, &new_jh, blocknr); > if (flags < 0) { > @@ -693,6 +696,11 @@ start_journal_io: > clear_buffer_dirty(bh); > set_buffer_uptodate(bh); > bh->b_end_io = journal_end_buffer_io_sync; > +#if 1 /* PDEBUG */ > + trace_printk("@700 %s block %llu\n", > + journal->j_devname, > + bh->b_blocknr); > +#endif > submit_bh(write_op, bh); > } > cond_resched(); > @@ -762,6 +770,10 @@ wait_for_iobuf: > jh = commit_transaction->t_iobuf_list->b_tprev; > bh = jh2bh(jh); > if (buffer_locked(bh)) { > +#if 1 /* PDEBUG */ > + trace_printk("jbd wait_on_buffer@765: %lu\n", > + (unsigned long) bh->b_blocknr); > +#endif > wait_on_buffer(bh); > goto wait_for_iobuf; > } > @@ -818,6 +830,11 @@ wait_for_iobuf: > jh = commit_transaction->t_log_list->b_tprev; > bh = jh2bh(jh); > if (buffer_locked(bh)) { > +#if 1 /* PDEBUG */ > + trace_printk("%s: jbd wait_on_buffer@823: %lu\n", > + journal->j_devname, > + (unsigned long) bh->b_blocknr); > +#endif > wait_on_buffer(bh); > goto wait_for_ctlbuf; > } > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c > index 6bf0a24..8873caa 100644 > --- a/fs/jbd2/transaction.c > +++ b/fs/jbd2/transaction.c > @@ -701,6 +701,11 @@ repeat: > for ( ; ; ) { > prepare_to_wait(wqh, &wait.wait, > TASK_UNINTERRUPTIBLE); > +#if 1 /* PDEBUG */ > + trace_printk("%s: BJ shadow waiting on %lu\n", > + journal->j_devname, > + (unsigned long) bh->b_blocknr); > +#endif > if (jh->b_jlist != BJ_Shadow) > break; > schedule(); > diff --git a/mm/filemap.c b/mm/filemap.c > index 3d4df44..dfdbcb1 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -295,6 +295,13 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, > if (page->index > end) > continue; > > +#if 1 /* PDEBUG */ > + if (PageWriteback(page)) > + trace_printk(KERN_NOTICE "pid %d waiting on %lu:%lu\n", > + task_pid_nr(current), > + mapping->host->i_ino, > + (unsigned long) page->index); > +#endif > wait_on_page_writeback(page); > if (PageError(page)) > ret = -EIO; > -- > 1.7.1 > Other then that I can't see anything obvious Boaz -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html