"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> writes: > The range_cyclic writeback mode uses the address_space writeback_index > as the start index for writeback. With delayed allocation we were > updating writeback_index wrongly resulting in highly fragmented file. > Number of extents reduced from 4000 to 27 for a 3GB file with the below > patch. Hi i've played with fragmentation patches with following result: I've had several crash and deadlocks for example objects wasn't freed on umount: EXT4-fs: mballoc: 12800 blocks 13 reqs (6 success) EXT4-fs: mballoc: 7 extents scanned, 12 goal hits, 1 2^N hits, 0 breaks, 0 lost EXT4-fs: mballoc: 1 generated and it took 3024 EXT4-fs: mballoc: 7608 preallocated, 1536 discarded slab error in kmem_cache_destroy(): cache `ext4_prealloc_space': Can't free all objects Pid: 7703, comm: rmmod Not tainted 2.6.27-rc8 #3 Call Trace: [<ffffffff8028b011>] kmem_cache_destroy+0x7d/0xc0 [<ffffffffa03ca057>] exit_ext4_mballoc+0x10/0x1e [ext4dev] [<ffffffffa03d35b3>] exit_ext4_fs+0x1f/0x2f [ext4dev] [<ffffffff80250dff>] sys_delete_module+0x199/0x1f3 [<ffffffff8025d06e>] audit_syscall_entry+0x12d/0x160 [<ffffffff8020be0b>] system_call_fastpath+0x16/0x1b Some times sync has stuck. I'm not shure is it because of current patch set, but where is at least one brand new bug. See later. > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Theodore Ts'o <tytso@xxxxxxx> > --- > fs/ext4/inode.c | 83 +++++++++++++++++++++++++++++++++---------------------- > 1 files changed, 50 insertions(+), 33 deletions(-) > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index cba7960..844c136 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) > int ret = 0, err, nr_pages, i; > unsigned long index, end; > struct pagevec pvec; > + long pages_skipped; > > BUG_ON(mpd->next_page <= mpd->first_page); > pagevec_init(&pvec, 0); > @@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) > end = mpd->next_page - 1; [1] In fact mpd->next_page may be bigger whan (index + wbc->nr_to_write) this result in incorrect math while exit from mpage_da_writepages() Probably we have bound end_index here. end = min(mpd->next_page, index +wbc->nr_to_write) -1; > > while (index <= end) { > - /* XXX: optimize tail */ > /* > * We can use PAGECACHE_TAG_DIRTY lookup here because > * even though we have cleared the dirty flag on the page > @@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) > for (i = 0; i < nr_pages; i++) { > struct page *page = pvec.pages[i]; > > + pages_skipped = mpd->wbc->pages_skipped; > err = mapping->a_ops->writepage(page, mpd->wbc); > - if (!err) > + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) > + /* > + * have successfully written the page > + * without skipping the same > + */ > mpd->pages_written++; > /* > * In error case, we have to continue because > @@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, > struct writeback_control *wbc, > struct mpage_da_data *mpd) > { > - long to_write; > int ret; > > if (!mpd->get_block) > @@ -2125,10 +2129,7 @@ static int mpage_da_writepages(struct address_space *mapping, > mpd->pages_written = 0; > mpd->retval = 0; > > - to_write = wbc->nr_to_write; > - > ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); > - > /* > * Handle last extent of pages > */ > @@ -2137,7 +2138,7 @@ static int mpage_da_writepages(struct address_space *mapping, > mpage_da_submit_io(mpd); > } > > - wbc->nr_to_write = to_write - mpd->pages_written; > + wbc->nr_to_write -= mpd->pages_written; due to [1] wbc->nr_write becomes negative here > return ret; > } > > @@ -2366,11 +2367,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) > static int ext4_da_writepages(struct address_space *mapping, > struct writeback_control *wbc) > { > + pgoff_t index; > + int range_whole = 0; > handle_t *handle = NULL; > + long pages_written = 0; > struct mpage_da_data mpd; > struct inode *inode = mapping->host; > + int no_nrwrite_update, no_index_update; > int needed_blocks, ret = 0, nr_to_writebump = 0; > - long to_write, pages_skipped = 0; > struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); > > /* > @@ -2390,16 +2394,27 @@ static int ext4_da_writepages(struct address_space *mapping, > nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; > wbc->nr_to_write = sbi->s_mb_stream_request; > } > + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) > + range_whole = 1; > > - > - pages_skipped = wbc->pages_skipped; > + if (wbc->range_cyclic) > + index = mapping->writeback_index; > + else > + index = wbc->range_start >> PAGE_CACHE_SHIFT; > > mpd.wbc = wbc; > mpd.inode = mapping->host; > > -restart_loop: > - to_write = wbc->nr_to_write; > - while (!ret && to_write > 0) { > + /* > + * we don't want write_cache_pages to update > + * nr_to_write and writeback_index > + */ > + no_nrwrite_update = wbc->no_nrwrite_update; > + wbc->no_nrwrite_update = 1; > + no_index_update = wbc->no_index_update; > + wbc->no_index_update = 1; > + > + while (!ret && wbc->nr_to_write > 0) { > > /* > * we insert one extent at a time. So we need > @@ -2420,46 +2435,48 @@ static int ext4_da_writepages(struct address_space *mapping, > dump_stack(); > goto out_writepages; > } > - to_write -= wbc->nr_to_write; > - > mpd.get_block = ext4_da_get_block_write; > ret = mpage_da_writepages(mapping, wbc, &mpd); > > ext4_journal_stop(handle); > > - if (mpd.retval == -ENOSPC) > + if (mpd.retval == -ENOSPC) { > + /* commit the transaction which would > + * free blocks released in the transaction > + * and try again > + */ > jbd2_journal_force_commit_nested(sbi->s_journal); > - > - /* reset the retry count */ > - if (ret == MPAGE_DA_EXTENT_TAIL) { > + ret = 0; > + } else if (ret == MPAGE_DA_EXTENT_TAIL) { > /* > * got one extent now try with > * rest of the pages > */ > - to_write += wbc->nr_to_write; > + pages_written += mpd.pages_written; > ret = 0; > - } else if (wbc->nr_to_write) { > + } else if (wbc->nr_to_write) > /* > * There is no more writeout needed > * or we requested for a noblocking writeout > * and we found the device congested > */ > - to_write += wbc->nr_to_write; > break; > - } > - wbc->nr_to_write = to_write; > - } > - > - if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) { > - /* We skipped pages in this loop */ > - wbc->nr_to_write = to_write + > - wbc->pages_skipped - pages_skipped; > - wbc->pages_skipped = pages_skipped; > - goto restart_loop; > } > + /* Update index */ > + index += pages_written; > + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) > + /* > + * set the writeback_index so that range_cyclic > + * mode will write it back later > + */ > + mapping->writeback_index = index; > > out_writepages: > - wbc->nr_to_write = to_write - nr_to_writebump; > + if (!no_nrwrite_update) > + wbc->no_nrwrite_update = 0; > + if (!no_index_update) > + wbc->no_index_update = 0; > + wbc->nr_to_write -= nr_to_writebump; > return ret; > } BTW: please add following cleanup fix. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6efa4ca..c248cbc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2603,7 +2603,7 @@ static int ext4_da_write_end(struct file *file, handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; - int write_mode = (int)fsdata; + int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { if (ext4_should_order_data(inode)) { -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html