On Fri, Feb 08, 2013 at 04:44:05PM +0800, Zheng Liu wrote: > From: Zheng Liu <wenqing.lz@xxxxxxxxxx> > > This commit tries to convert unwritten extents from extent status tree > in end_io callback functions and ext4_ext_direct_IO. > > Signed-off-by: Zheng Liu <wenqing.lz@xxxxxxxxxx> > Cc: "Theodore Ts'o" <tytso@xxxxxxx> > Cc: Jan kara <jack@xxxxxxx> > --- > fs/ext4/extents.c | 6 +- > fs/ext4/extents_status.c | 180 ++++++++++++++++++++++++++++++++++++++++---- > fs/ext4/extents_status.h | 2 + > fs/ext4/inode.c | 5 ++ > fs/ext4/page-io.c | 8 +- > include/trace/events/ext4.h | 25 ++++++ > 6 files changed, 208 insertions(+), 18 deletions(-) > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index 9f21430..a03cabf 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -4443,8 +4443,10 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, > ret = PTR_ERR(handle); > break; > } > - ret = ext4_map_blocks(handle, inode, &map, > - EXT4_GET_BLOCKS_IO_CONVERT_EXT); > + down_write(&EXT4_I(inode)->i_data_sem); > + ret = ext4_ext_map_blocks(handle, inode, &map, > + EXT4_GET_BLOCKS_IO_CONVERT_EXT); > + up_write(&EXT4_I(inode)->i_data_sem); > if (ret <= 0) { > WARN_ON(ret <= 0); > ext4_msg(inode->i_sb, KERN_ERR, Hi Ted, This chunk is missing in your 'dev' branch of ext4. If we call ext4_map_blocks here, unwritten extent in disk will never be converted because ext4_map_blocks first tries to lookup extent status tree and the unwriten extent in this tree has been converted in end_io callback function. It always looks up a written extent in cache, and return immdiately. Please check it again. Thanks, - Zheng > diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c > index bac5286..eab8893 100644 > --- a/fs/ext4/extents_status.c > +++ b/fs/ext4/extents_status.c > @@ -248,10 +248,11 @@ ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode, > struct extent_status *es1 = NULL; > struct rb_node *node; > ext4_lblk_t ret = EXT_MAX_BLOCKS; > + unsigned long flags; > > trace_ext4_es_find_delayed_extent_enter(inode, es->es_lblk); > > - read_lock(&EXT4_I(inode)->i_es_lock); > + read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags); > tree = &EXT4_I(inode)->i_es_tree; > > /* find extent in cache firstly */ > @@ -291,7 +292,7 @@ out: > } > } > > - read_unlock(&EXT4_I(inode)->i_es_lock); > + read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags); > > ext4_es_lru_add(inode); > trace_ext4_es_find_delayed_extent_exit(inode, es, ret); > @@ -458,6 +459,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, > { > struct extent_status newes; > ext4_lblk_t end = lblk + len - 1; > + unsigned long flags; > int err = 0; > > es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", > @@ -471,14 +473,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, > ext4_es_store_status(&newes, status); > trace_ext4_es_insert_extent(inode, &newes); > > - write_lock(&EXT4_I(inode)->i_es_lock); > + write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags); > err = __es_remove_extent(inode, lblk, end); > if (err != 0) > goto error; > err = __es_insert_extent(inode, &newes); > > error: > - write_unlock(&EXT4_I(inode)->i_es_lock); > + write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags); > > ext4_es_lru_add(inode); > ext4_es_print_tree(inode); > @@ -498,13 +500,14 @@ int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es) > struct ext4_es_tree *tree; > struct extent_status *es1 = NULL; > struct rb_node *node; > + unsigned long flags; > int found = 0; > > trace_ext4_es_lookup_extent_enter(inode, es->es_lblk); > es_debug("lookup extent in block %u\n", es->es_lblk); > > tree = &EXT4_I(inode)->i_es_tree; > - read_lock(&EXT4_I(inode)->i_es_lock); > + read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags); > > /* find extent in cache firstly */ > es->es_len = es->es_pblk = 0; > @@ -539,7 +542,7 @@ out: > es->es_pblk = es1->es_pblk; > } > > - read_unlock(&EXT4_I(inode)->i_es_lock); > + read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags); > > ext4_es_lru_add(inode); > trace_ext4_es_lookup_extent_exit(inode, es, found); > @@ -649,6 +652,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, > ext4_lblk_t len) > { > ext4_lblk_t end; > + unsigned long flags; > int err = 0; > > trace_ext4_es_remove_extent(inode, lblk, len); > @@ -658,9 +662,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, > end = lblk + len - 1; > BUG_ON(end < lblk); > > - write_lock(&EXT4_I(inode)->i_es_lock); > + write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags); > err = __es_remove_extent(inode, lblk, end); > - write_unlock(&EXT4_I(inode)->i_es_lock); > + write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags); > ext4_es_print_tree(inode); > return err; > } > @@ -671,6 +675,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) > struct ext4_sb_info, s_es_shrinker); > struct ext4_inode_info *ei; > struct list_head *cur, *tmp, scanned; > + unsigned long flags; > int nr_to_scan = sc->nr_to_scan; > int ret, nr_shrunk = 0; > > @@ -687,16 +692,16 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) > > ei = list_entry(cur, struct ext4_inode_info, i_es_lru); > > - read_lock(&ei->i_es_lock); > + read_lock_irqsave(&ei->i_es_lock, flags); > if (ei->i_es_lru_nr == 0) { > - read_unlock(&ei->i_es_lock); > + read_unlock_irqrestore(&ei->i_es_lock, flags); > continue; > } > - read_unlock(&ei->i_es_lock); > + read_unlock_irqrestore(&ei->i_es_lock, flags); > > - write_lock(&ei->i_es_lock); > + write_lock_irqsave(&ei->i_es_lock, flags); > ret = __es_try_to_reclaim_extents(ei, nr_to_scan); > - write_unlock(&ei->i_es_lock); > + write_unlock_irqrestore(&ei->i_es_lock, flags); > > nr_shrunk += ret; > nr_to_scan -= ret; > @@ -756,14 +761,15 @@ static int ext4_es_reclaim_extents_count(struct super_block *sb) > struct ext4_sb_info *sbi = EXT4_SB(sb); > struct ext4_inode_info *ei; > struct list_head *cur; > + unsigned long flags; > int nr_cached = 0; > > spin_lock(&sbi->s_es_lru_lock); > list_for_each(cur, &sbi->s_es_lru) { > ei = list_entry(cur, struct ext4_inode_info, i_es_lru); > - read_lock(&ei->i_es_lock); > + read_lock_irqsave(&ei->i_es_lock, flags); > nr_cached += ei->i_es_lru_nr; > - read_unlock(&ei->i_es_lock); > + read_unlock_irqrestore(&ei->i_es_lock, flags); > } > spin_unlock(&sbi->s_es_lru_lock); > trace_ext4_es_reclaim_extents_count(sb, nr_cached); > @@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, > tree->cache_es = NULL; > return nr_shrunk; > } > + > +int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset, > + size_t size) > +{ > + struct ext4_es_tree *tree; > + struct rb_node *node; > + struct extent_status *es, orig_es, conv_es; > + ext4_lblk_t end, len1, len2; > + ext4_lblk_t lblk = 0, len = 0; > + ext4_fsblk_t block; > + unsigned long flags; > + unsigned int blkbits; > + int err = 0; > + > + trace_ext4_es_convert_unwritten_extents(inode, offset, size); > + blkbits = inode->i_blkbits; > + lblk = offset >> blkbits; > + len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk; > + > + end = lblk + len - 1; > + BUG_ON(end < lblk); > + > + tree = &EXT4_I(inode)->i_es_tree; > + > + write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags); > + es = __es_tree_search(&tree->root, lblk); > + if (!es) > + goto out; > + if (es->es_lblk > end) > + goto out; > + > + tree->cache_es = NULL; > + > + orig_es.es_lblk = es->es_lblk; > + orig_es.es_len = es->es_len; > + orig_es.es_pblk = es->es_pblk; > + > + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; > + len2 = ext4_es_end(es) > end ? > + ext4_es_end(es) - end : 0; > + if (len1 > 0) > + es->es_len = len1; > + if (len2 > 0) { > + if (len1 > 0) { > + struct extent_status newes; > + > + newes.es_lblk = end + 1; > + newes.es_len = len2; > + block = ext4_es_pblock(&orig_es) + > + orig_es.es_len - len2; > + ext4_es_store_pblock(&newes, block); > + ext4_es_store_status(&newes, ext4_es_status(&orig_es)); > + err = __es_insert_extent(inode, &newes); > + if (err) { > + es->es_lblk = orig_es.es_lblk; > + es->es_len = orig_es.es_len; > + es->es_pblk = orig_es.es_pblk; > + goto out; > + } > + > + conv_es.es_lblk = orig_es.es_lblk + len1; > + conv_es.es_len = orig_es.es_len - len1 - len2; > + block = ext4_es_pblock(&orig_es) + len1; > + ext4_es_store_pblock(&conv_es, block); > + ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN); > + err = __es_insert_extent(inode, &conv_es); > + if (err) { > + int err2 = __es_remove_extent(inode, > + conv_es.es_lblk, > + ext4_es_end(&newes)); > + if (err2) > + goto out; > + es->es_lblk = orig_es.es_lblk; > + es->es_len = orig_es.es_len; > + es->es_pblk = orig_es.es_pblk; > + goto out; > + } > + } else { > + es->es_lblk = end + 1; > + es->es_len = len2; > + block = ext4_es_pblock(&orig_es) + > + orig_es.es_len - len2; > + ext4_es_store_pblock(es, block); > + > + conv_es.es_lblk = orig_es.es_lblk; > + conv_es.es_len = orig_es.es_len - len2; > + ext4_es_store_pblock(&conv_es, > + ext4_es_pblock(&orig_es)); > + ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN); > + err = __es_insert_extent(inode, &conv_es); > + if (err) { > + es->es_lblk = orig_es.es_lblk; > + es->es_len = orig_es.es_len; > + es->es_pblk = orig_es.es_pblk; > + } > + } > + goto out; > + } > + > + if (len1 > 0) { > + node = rb_next(&es->rb_node); > + if (node) > + es = rb_entry(node, struct extent_status, rb_node); > + else > + es = NULL; > + } > + > + while (es && ext4_es_end(es) <= end) { > + node = rb_next(&es->rb_node); > + ext4_es_store_status(es, EXTENT_STATUS_WRITTEN); > + if (!inode) { > + es = NULL; > + break; > + } > + es = rb_entry(node, struct extent_status, rb_node); > + } > + > + if (es && es->es_lblk < end + 1) { > + ext4_lblk_t orig_len = es->es_len; > + > + /* > + * Here we first set conv_es just because of avoiding copy the > + * value of es to a temporary variable. > + */ > + len1 = ext4_es_end(es) - end; > + conv_es.es_lblk = es->es_lblk; > + conv_es.es_len = es->es_len - len1; > + ext4_es_store_pblock(&conv_es, ext4_es_pblock(es)); > + ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN); > + > + es->es_lblk = end + 1; > + es->es_len = len1; > + block = ext4_es_pblock(es) + orig_len - len1; > + ext4_es_store_pblock(es, block); > + > + err = __es_insert_extent(inode, &conv_es); > + if (err) > + goto out; > + } > + > +out: > + write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags); > + return err; > +} > diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h > index 938ad2b..2849d74 100644 > --- a/fs/ext4/extents_status.h > +++ b/fs/ext4/extents_status.h > @@ -54,6 +54,8 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, > extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode, > struct extent_status *es); > extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es); > +extern int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset, > + size_t size); > > static inline int ext4_es_is_written(struct extent_status *es) > { > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 670779a..08cf720 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -3063,6 +3063,7 @@ out: > io_end->result = ret; > } > > + ext4_es_convert_unwritten_extents(inode, offset, size); > ext4_add_complete_io(io_end); > } > > @@ -3088,6 +3089,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) > */ > inode = io_end->inode; > ext4_set_io_unwritten_flag(inode, io_end); > + ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size); > ext4_add_complete_io(io_end); > out: > bh->b_private = NULL; > @@ -3246,6 +3248,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, > } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, > EXT4_STATE_DIO_UNWRITTEN)) { > int err; > + err = ext4_es_convert_unwritten_extents(inode, offset, ret); > + if (err) > + ret = err; > /* > * for non AIO case, since the IO is already > * completed, we could do the conversion right here > diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c > index 0016fbc..66ea30e 100644 > --- a/fs/ext4/page-io.c > +++ b/fs/ext4/page-io.c > @@ -276,6 +276,13 @@ static void ext4_end_bio(struct bio *bio, int error) > error = 0; > bio_put(bio); > > + /* > + * We need to convert unwrittne extents in extent status tree before > + * end_page_writeback() is called. Otherwise, when dioread_nolock is > + * enabled, we will be likely to read stale data. > + */ > + inode = io_end->inode; > + ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size); > for (i = 0; i < io_end->num_io_pages; i++) { > struct page *page = io_end->pages[i]->p_page; > struct buffer_head *bh, *head; > @@ -305,7 +312,6 @@ static void ext4_end_bio(struct bio *bio, int error) > put_io_page(io_end->pages[i]); > } > io_end->num_io_pages = 0; > - inode = io_end->inode; > > if (error) { > io_end->flag |= EXT4_IO_END_ERROR; > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h > index f0734b3..d32e3d5 100644 > --- a/include/trace/events/ext4.h > +++ b/include/trace/events/ext4.h > @@ -2233,6 +2233,31 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, > __entry->found ? __entry->status : 0) > ); > > +TRACE_EVENT(ext4_es_convert_unwritten_extents, > + TP_PROTO(struct inode *inode, loff_t offset, loff_t size), > + > + TP_ARGS(inode, offset, size), > + > + TP_STRUCT__entry( > + __field( dev_t, dev ) > + __field( ino_t, ino ) > + __field( loff_t, offset ) > + __field( loff_t, size ) > + ), > + > + TP_fast_assign( > + __entry->dev = inode->i_sb->s_dev; > + __entry->ino = inode->i_ino; > + __entry->offset = offset; > + __entry->size = size; > + ), > + > + TP_printk("dev %d,%d ino %lu convert unwritten extents [%llu/%llu", > + MAJOR(__entry->dev), MINOR(__entry->dev), > + (unsigned long) __entry->ino, > + __entry->offset, __entry->size) > +); > + > TRACE_EVENT(ext4_es_reclaim_extents_count, > TP_PROTO(struct super_block *sb, int nr_cached), > > -- > 1.7.12.rc2.18.g61b472e > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html