在 2008-08-11一的 15:53 +0530,Aneesh Kumar K.V写道: > With the below changes we reserve credit needed to insert only one extent > resulting from a call to single get_block. That make sure we don't take > too much journal credits during writeout. We also don't limit the pages > to write. That means we loop through the dirty pages building largest > possible contiguous block request. Then we issue a single get_block request. > We may get less block that we requested. If so we would end up not mapping > some of the buffer_heads. That means those buffer_heads are still marked delay. > Later in the writepage callback via __mpage_writepage we redirty those pages. > > We should also not limit/throttle wbc->nr_to_write in the filesystem writepages > callback. That cause wrong behaviour in generic_sync_sb_inodes caused by > wbc->nr_to_write being <= 0 > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> Updated in patch queue. > --- > fs/ext4/inode.c | 200 ++++++++++++++++++++++++++++-------------------------- > 1 files changed, 104 insertions(+), 96 deletions(-) > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 7a66bba..26e30ed 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -41,6 +41,8 @@ > #include "acl.h" > #include "ext4_extents.h" > > +#define MPAGE_DA_EXTENT_TAIL 0x01 > + > static inline int ext4_begin_ordered_truncate(struct inode *inode, > loff_t new_size) > { > @@ -1604,11 +1606,13 @@ static void ext4_da_page_release_reservation(struct page *page, > unsigned long first_page, next_page; /* extent of pages */ > get_block_t *get_block; > struct writeback_control *wbc; > + int io_done; > + long pages_written; > }; > > /* > * mpage_da_submit_io - walks through extent of pages and try to write > - * them with __mpage_writepage() > + * them with writepage() call back > * > * @mpd->inode: inode > * @mpd->first_page: first page of the extent > @@ -1623,18 +1627,11 @@ static void ext4_da_page_release_reservation(struct page *page, > static int mpage_da_submit_io(struct mpage_da_data *mpd) > { > struct address_space *mapping = mpd->inode->i_mapping; > - struct mpage_data mpd_pp = { > - .bio = NULL, > - .last_block_in_bio = 0, > - .get_block = mpd->get_block, > - .use_writepage = 1, > - }; > int ret = 0, err, nr_pages, i; > unsigned long index, end; > struct pagevec pvec; > > BUG_ON(mpd->next_page <= mpd->first_page); > - > pagevec_init(&pvec, 0); > index = mpd->first_page; > end = mpd->next_page - 1; > @@ -1652,8 +1649,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) > break; > index++; > > - err = __mpage_writepage(page, mpd->wbc, &mpd_pp); > - > + err = mapping->a_ops->writepage(page, mpd->wbc); > + if (!err) > + mpd->pages_written++; > /* > * In error case, we have to continue because > * remaining pages are still locked > @@ -1664,9 +1662,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) > } > pagevec_release(&pvec); > } > - if (mpd_pp.bio) > - mpage_bio_submit(WRITE, mpd_pp.bio); > - > return ret; > } > > @@ -1689,7 +1684,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, > int blocks = exbh->b_size >> inode->i_blkbits; > sector_t pblock = exbh->b_blocknr, cur_logical; > struct buffer_head *head, *bh; > - unsigned long index, end; > + pgoff_t index, end; > struct pagevec pvec; > int nr_pages, i; > > @@ -1774,13 +1769,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode, > * > * The function skips space we know is already mapped to disk blocks. > * > - * The function ignores errors ->get_block() returns, thus real > - * error handling is postponed to __mpage_writepage() > */ > static void mpage_da_map_blocks(struct mpage_da_data *mpd) > { > + int err = 0; > struct buffer_head *lbh = &mpd->lbh; > - int err = 0, remain = lbh->b_size; > sector_t next = lbh->b_blocknr; > struct buffer_head new; > > @@ -1790,35 +1783,32 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) > if (buffer_mapped(lbh) && !buffer_delay(lbh)) > return; > > - while (remain) { > - new.b_state = lbh->b_state; > - new.b_blocknr = 0; > - new.b_size = remain; > - err = mpd->get_block(mpd->inode, next, &new, 1); > - if (err) { > - /* > - * Rather than implement own error handling > - * here, we just leave remaining blocks > - * unallocated and try again with ->writepage() > - */ > - break; > - } > - BUG_ON(new.b_size == 0); > + new.b_state = lbh->b_state; > + new.b_blocknr = 0; > + new.b_size = lbh->b_size; > + > + /* > + * If we didn't accumulate anything > + * to write simply return > + */ > + if (!new.b_size) > + return; > + err = mpd->get_block(mpd->inode, next, &new, 1); > + if (err) > + return; > + BUG_ON(new.b_size == 0); > > - if (buffer_new(&new)) > - __unmap_underlying_blocks(mpd->inode, &new); > + if (buffer_new(&new)) > + __unmap_underlying_blocks(mpd->inode, &new); > > - /* > - * If blocks are delayed marked, we need to > - * put actual blocknr and drop delayed bit > - */ > - if (buffer_delay(lbh) || buffer_unwritten(lbh)) > - mpage_put_bnr_to_bhs(mpd, next, &new); > + /* > + * If blocks are delayed marked, we need to > + * put actual blocknr and drop delayed bit > + */ > + if (buffer_delay(lbh) || buffer_unwritten(lbh)) > + mpage_put_bnr_to_bhs(mpd, next, &new); > > - /* go for the remaining blocks */ > - next += new.b_size >> mpd->inode->i_blkbits; > - remain -= new.b_size; > - } > + return; > } > > #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ > @@ -1864,13 +1854,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, > * need to flush current extent and start new one > */ > mpage_da_map_blocks(mpd); > - > - /* > - * Now start a new extent > - */ > - lbh->b_size = bh->b_size; > - lbh->b_state = bh->b_state & BH_FLAGS; > - lbh->b_blocknr = logical; > + mpage_da_submit_io(mpd); > + mpd->io_done = 1; > + return; > } > > /* > @@ -1890,17 +1876,35 @@ static int __mpage_da_writepage(struct page *page, > struct buffer_head *bh, *head, fake; > sector_t logical; > > + if (mpd->io_done) { > + /* > + * Rest of the page in the page_vec > + * redirty then and skip then. We will > + * try to to write them again after > + * starting a new transaction > + */ > + redirty_page_for_writepage(wbc, page); > + unlock_page(page); > + return MPAGE_DA_EXTENT_TAIL; > + } > /* > * Can we merge this page to current extent? > */ > if (mpd->next_page != page->index) { > /* > * Nope, we can't. So, we map non-allocated blocks > - * and start IO on them using __mpage_writepage() > + * and start IO on them using writepage() > */ > if (mpd->next_page != mpd->first_page) { > mpage_da_map_blocks(mpd); > mpage_da_submit_io(mpd); > + /* > + * skip rest of the page in the page_vec > + */ > + mpd->io_done = 1; > + redirty_page_for_writepage(wbc, page); > + unlock_page(page); > + return MPAGE_DA_EXTENT_TAIL; > } > > /* > @@ -1931,6 +1935,8 @@ static int __mpage_da_writepage(struct page *page, > set_buffer_dirty(bh); > set_buffer_uptodate(bh); > mpage_add_bh_to_extent(mpd, logical, bh); > + if (mpd->io_done) > + return MPAGE_DA_EXTENT_TAIL; > } else { > /* > * Page with regular buffer heads, just add all dirty ones > @@ -1939,8 +1945,12 @@ static int __mpage_da_writepage(struct page *page, > bh = head; > do { > BUG_ON(buffer_locked(bh)); > - if (buffer_dirty(bh)) > + if (buffer_dirty(bh) && > + (!buffer_mapped(bh) || buffer_delay(bh))) { > mpage_add_bh_to_extent(mpd, logical, bh); > + if (mpd->io_done) > + return MPAGE_DA_EXTENT_TAIL; > + } > logical++; > } while ((bh = bh->b_this_page) != head); > } > @@ -1959,22 +1969,13 @@ static int __mpage_da_writepage(struct page *page, > * > * This is a library function, which implements the writepages() > * address_space_operation. > - * > - * In order to avoid duplication of logic that deals with partial pages, > - * multiple bio per page, etc, we find non-allocated blocks, allocate > - * them with minimal calls to ->get_block() and re-use __mpage_writepage() > - * > - * It's important that we call __mpage_writepage() only once for each > - * involved page, otherwise we'd have to implement more complicated logic > - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. > - * > - * See comments to mpage_writepages() > */ > static int mpage_da_writepages(struct address_space *mapping, > struct writeback_control *wbc, > get_block_t get_block) > { > struct mpage_da_data mpd; > + long to_write; > int ret; > > if (!get_block) > @@ -1988,17 +1989,22 @@ static int mpage_da_writepages(struct address_space *mapping, > mpd.first_page = 0; > mpd.next_page = 0; > mpd.get_block = get_block; > + mpd.io_done = 0; > + mpd.pages_written = 0; > + > + to_write = wbc->nr_to_write; > > ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); > > /* > * Handle last extent of pages > */ > - if (mpd.next_page != mpd.first_page) { > + if (!mpd.io_done && mpd.next_page != mpd.first_page) { > mpage_da_map_blocks(&mpd); > mpage_da_submit_io(&mpd); > } > > + wbc->nr_to_write = to_write - mpd.pages_written; > return ret; > } > > @@ -2210,10 +2216,7 @@ static int ext4_da_writepages(struct address_space *mapping, > int ret = 0; > long to_write; > loff_t range_start = 0; > - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; > - int max_credit_blocks = ext4_journal_max_transaction_buffers(inode); > - int need_credits_per_page = ext4_writepages_trans_blocks(inode, 1); > - int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page; > + long pages_skipped = 0; > > /* > * No pages to write? This is mainly a kludge to avoid starting > @@ -2223,45 +2226,35 @@ static int ext4_da_writepages(struct address_space *mapping, > if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) > return 0; > > - if (wbc->nr_to_write > mapping->nrpages) > - wbc->nr_to_write = mapping->nrpages; > - > - to_write = wbc->nr_to_write; > - > - if (!wbc->range_cyclic) { > + if (!wbc->range_cyclic) > /* > * If range_cyclic is not set force range_cont > * and save the old writeback_index > */ > wbc->range_cont = 1; > - range_start = wbc->range_start; > - } > > - while (!ret && to_write) { > - /* > - * set the max dirty pages could be write at a time > - * to fit into the reserved transaction credits > - */ > - if (wbc->nr_to_write > max_writeback_pages) > - wbc->nr_to_write = max_writeback_pages; > + range_start = wbc->range_start; > + pages_skipped = wbc->pages_skipped; > + > +restart_loop: > + to_write = wbc->nr_to_write; > + while (!ret && to_write > 0) { > > /* > - * Estimate the worse case needed credits to write out > - * to_write pages > + * we insert one extent at a time. So we need > + * credit needed for single extent allocation. > + * journalled mode is currently not supported > + * by delalloc > */ > - needed_blocks = ext4_writepages_trans_blocks(inode, > - wbc->nr_to_write); > - while (needed_blocks > max_credit_blocks) { > - wbc->nr_to_write--; > - needed_blocks = ext4_writepages_trans_blocks(inode, > - wbc->nr_to_write); > - } > + BUG_ON(ext4_should_journal_data(inode)); > + needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); > + > /* start a new transaction*/ > handle = ext4_journal_start(inode, needed_blocks); > if (IS_ERR(handle)) { > ret = PTR_ERR(handle); > - printk(KERN_EMERG "ext4_da_writepages: jbd2_start: " > - "%ld pages, ino %lu; err %d\n", > + printk(KERN_EMERG "%s: jbd2_start: " > + "%ld pages, ino %lu; err %d\n", __func__, > wbc->nr_to_write, inode->i_ino, ret); > dump_stack(); > goto out_writepages; > @@ -2284,7 +2277,14 @@ static int ext4_da_writepages(struct address_space *mapping, > ret = mpage_da_writepages(mapping, wbc, > ext4_da_get_block_write); > ext4_journal_stop(handle); > - if (wbc->nr_to_write) { > + if (ret == MPAGE_DA_EXTENT_TAIL) { > + /* > + * got one extent now try with > + * rest of the pages > + */ > + to_write += wbc->nr_to_write; > + ret = 0; > + } else if (wbc->nr_to_write) { > /* > * There is no more writeout needed > * or we requested for a noblocking writeout > @@ -2296,10 +2296,18 @@ static int ext4_da_writepages(struct address_space *mapping, > wbc->nr_to_write = to_write; > } > > + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { > + /* We skipped pages in this loop */ > + wbc->range_start = range_start; > + wbc->nr_to_write = to_write + > + wbc->pages_skipped - pages_skipped; > + wbc->pages_skipped = pages_skipped; > + goto restart_loop; > + } > + > out_writepages: > wbc->nr_to_write = to_write; > - if (range_start) > - wbc->range_start = range_start; > + wbc->range_start = range_start; > return ret; > } > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html