On Thu, Jul 11, 2013 at 12:39:02AM -0400, Theodore Ts'o wrote: > When we read in an extent tree leaf block from disk, arrange to have > all of its entries cached. In nearly all cases the in-memory > representation will be more compact than the on-disk representation in > the buffer cache, and it allows us to get the information without > having to traverse the extent tree for successive extents. > > Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx> > Cc: Zheng Liu <wenqing.lz@xxxxxxxxxx> One minor nit below. Otherwise the patch looks good to me. Reviewed-by: Zheng Liu <wenqing.lz@xxxxxxxxxx> > --- > fs/ext4/ext4.h | 14 +++++++- > fs/ext4/extents.c | 84 +++++++++++++++++++++++++++++++-------------- > fs/ext4/extents_status.c | 39 ++++++++++++++++++++- > fs/ext4/extents_status.h | 3 ++ > fs/ext4/migrate.c | 2 +- > fs/ext4/move_extent.c | 2 +- > include/trace/events/ext4.h | 14 +++++++- > 7 files changed, 127 insertions(+), 31 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 6ed348d..a8497b0 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -561,6 +561,17 @@ enum { > #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 > > /* > + * The bit position of this flag must not overlap with any of the > + * EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(), > + * read_extent_tree_block(), ext4_split_extent_at(), > + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to > + * indicate that the we shouldn't be caching the extents when reading > + * from the extent tree while a truncate or punch hole operation > + * is in progress. > + */ > +#define EXT4_EX_NOCACHE 0x0400 > + > +/* > * Flags used by ext4_free_blocks > */ > #define EXT4_FREE_BLOCKS_METADATA 0x0001 > @@ -2683,7 +2694,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, > struct ext4_ext_path *, > struct ext4_extent *, int); > extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, > - struct ext4_ext_path *); > + struct ext4_ext_path *, > + int flags); > extern void ext4_ext_drop_refs(struct ext4_ext_path *); > extern int ext4_ext_check_inode(struct inode *inode); > extern int ext4_find_delalloc_range(struct inode *inode, > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index e174814..506c3fe 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -466,7 +466,8 @@ int ext4_ext_check_inode(struct inode *inode) > > static struct buffer_head * > __read_extent_tree_block(const char *function, unsigned int line, > - struct inode *inode, ext4_fsblk_t pblk, int depth) > + struct inode *inode, ext4_fsblk_t pblk, int depth, > + int flags) > { > struct buffer_head *bh; > int err; > @@ -490,6 +491,31 @@ __read_extent_tree_block(const char *function, unsigned int line, > if (err) > goto errout; > set_buffer_verified(bh); > + /* > + * If this is a leaf block, cache all of its entries > + */ > + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { > + struct ext4_extent_header *eh = ext_block_hdr(bh); > + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); > + int i; > + > + for (i = eh->eh_entries; i > 0; i--, ex++) { Here I get a warning message when I use the following command to build the ext4 kernel module. $ make M=fs/ext4 C=2 CF=-D__CHECK_ENDIAN__ CHECK fs/ext4/extents.c fs/ext4/extents.c:502:24: warning: incorrect type in assignment (different base types) fs/ext4/extents.c:502:24: expected int [signed] i fs/ext4/extents.c:502:24: got restricted __le16 [usertype] eh_entries - Zheng > + unsigned int status = EXTENT_STATUS_WRITTEN; > + ext4_lblk_t prev = 0, lblk = le32_to_cpu(ex->ee_block); > + int len = ext4_ext_get_actual_len(ex); > + > + if (prev && (prev != lblk)) > + ext4_es_cache_extent(inode, prev, > + lblk - prev, ~0, > + EXTENT_STATUS_HOLE); > + > + if (ext4_ext_is_uninitialized(ex)) > + status = EXTENT_STATUS_UNWRITTEN; > + ext4_es_cache_extent(inode, lblk, len, > + ext4_ext_pblock(ex), status); > + prev = lblk + len; > + } > + } > return bh; > errout: > put_bh(bh); > @@ -497,8 +523,9 @@ errout: > > } > > -#define read_extent_tree_block(inode, pblk, depth) \ > - __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), (depth)) > +#define read_extent_tree_block(inode, pblk, depth, flags) \ > + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ > + (depth), (flags)) > > #ifdef EXT_DEBUG > static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) > @@ -732,7 +759,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) > > struct ext4_ext_path * > ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, > - struct ext4_ext_path *path) > + struct ext4_ext_path *path, int flags) > { > struct ext4_extent_header *eh; > struct buffer_head *bh; > @@ -764,7 +791,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, > path[ppos].p_depth = i; > path[ppos].p_ext = NULL; > > - bh = read_extent_tree_block(inode, path[ppos].p_block, --i); > + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, > + flags); > if (IS_ERR(bh)) { > ret = PTR_ERR(bh); > goto err; > @@ -1201,7 +1229,8 @@ out: > * if no free index is found, then it requests in-depth growing. > */ > static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, > - unsigned int flags, > + unsigned int mb_flags, > + unsigned int gb_flags, > struct ext4_ext_path *path, > struct ext4_extent *newext) > { > @@ -1223,7 +1252,7 @@ repeat: > if (EXT_HAS_FREE_INDEX(curp)) { > /* if we found index with free entry, then use that > * entry: create all needed subtree and add new leaf */ > - err = ext4_ext_split(handle, inode, flags, path, newext, i); > + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); > if (err) > goto out; > > @@ -1231,12 +1260,12 @@ repeat: > ext4_ext_drop_refs(path); > path = ext4_ext_find_extent(inode, > (ext4_lblk_t)le32_to_cpu(newext->ee_block), > - path); > + path, gb_flags); > if (IS_ERR(path)) > err = PTR_ERR(path); > } else { > /* tree is full, time to grow in depth */ > - err = ext4_ext_grow_indepth(handle, inode, flags, newext); > + err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); > if (err) > goto out; > > @@ -1244,7 +1273,7 @@ repeat: > ext4_ext_drop_refs(path); > path = ext4_ext_find_extent(inode, > (ext4_lblk_t)le32_to_cpu(newext->ee_block), > - path); > + path, gb_flags); > if (IS_ERR(path)) { > err = PTR_ERR(path); > goto out; > @@ -1417,7 +1446,7 @@ got_index: > while (++depth < path->p_depth) { > /* subtract from p_depth to get proper eh_depth */ > bh = read_extent_tree_block(inode, *logical, > - path->p_depth - depth); > + path->p_depth - depth, 0); > if (IS_ERR(bh)) > return PTR_ERR(bh); > eh = ext_block_hdr(bh); > @@ -1426,7 +1455,7 @@ got_index: > put_bh(bh); > } > > - bh = read_extent_tree_block(inode, *logical, path->p_depth - depth); > + bh = read_extent_tree_block(inode, *logical, path->p_depth - depth, 0); > if (IS_ERR(bh)) > return PTR_ERR(bh); > eh = ext_block_hdr(bh); > @@ -1788,7 +1817,7 @@ out: > */ > int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > struct ext4_ext_path *path, > - struct ext4_extent *newext, int flag) > + struct ext4_extent *newext, int gb_flags) > { > struct ext4_extent_header *eh; > struct ext4_extent *ex, *fex; > @@ -1797,7 +1826,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > int depth, len, err; > ext4_lblk_t next; > unsigned uninitialized = 0; > - int flags = 0; > + int mb_flags = 0; > > if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { > EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); > @@ -1812,7 +1841,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > } > > /* try to insert block into found extent and return */ > - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { > + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { > > /* > * Try to see whether we should rather test the extent on > @@ -1915,7 +1944,7 @@ prepend: > if (next != EXT_MAX_BLOCKS) { > ext_debug("next leaf block - %u\n", next); > BUG_ON(npath != NULL); > - npath = ext4_ext_find_extent(inode, next, NULL); > + npath = ext4_ext_find_extent(inode, next, NULL, 0); > if (IS_ERR(npath)) > return PTR_ERR(npath); > BUG_ON(npath->p_depth != path->p_depth); > @@ -1934,9 +1963,10 @@ prepend: > * There is no free space in the found leaf. > * We're gonna add a new leaf in the tree. > */ > - if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) > - flags = EXT4_MB_USE_RESERVED; > - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); > + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) > + mb_flags = EXT4_MB_USE_RESERVED; > + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, > + path, newext); > if (err) > goto cleanup; > depth = ext_depth(inode); > @@ -2002,7 +2032,7 @@ has_space: > > merge: > /* try to merge extents */ > - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) > + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) > ext4_ext_try_to_merge(handle, inode, path, nearex); > > > @@ -2045,7 +2075,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, > path = NULL; > } > > - path = ext4_ext_find_extent(inode, block, path); > + path = ext4_ext_find_extent(inode, block, path, 0); > if (IS_ERR(path)) { > up_read(&EXT4_I(inode)->i_data_sem); > err = PTR_ERR(path); > @@ -2707,7 +2737,7 @@ again: > ext4_lblk_t ee_block; > > /* find extent for this block */ > - path = ext4_ext_find_extent(inode, end, NULL); > + path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); > if (IS_ERR(path)) { > ext4_journal_stop(handle); > return PTR_ERR(path); > @@ -2749,6 +2779,7 @@ again: > */ > err = ext4_split_extent_at(handle, inode, path, > end + 1, split_flag, > + EXT4_EX_NOCACHE | > EXT4_GET_BLOCKS_PRE_IO | > EXT4_GET_BLOCKS_METADATA_NOFAIL); > > @@ -2825,7 +2856,8 @@ again: > i + 1, ext4_idx_pblock(path[i].p_idx)); > memset(path + i + 1, 0, sizeof(*path)); > bh = read_extent_tree_block(inode, > - ext4_idx_pblock(path[i].p_idx), depth - i - 1); > + ext4_idx_pblock(path[i].p_idx), depth - i - 1, > + EXT4_EX_NOCACHE); > if (IS_ERR(bh)) { > err = PTR_ERR(bh); > break; > @@ -3168,7 +3200,7 @@ static int ext4_split_extent(handle_t *handle, > * result in split of original leaf or extent zeroout. > */ > ext4_ext_drop_refs(path); > - path = ext4_ext_find_extent(inode, map->m_lblk, path); > + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); > if (IS_ERR(path)) > return PTR_ERR(path); > depth = ext_depth(inode); > @@ -3552,7 +3584,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, > if (err < 0) > goto out; > ext4_ext_drop_refs(path); > - path = ext4_ext_find_extent(inode, map->m_lblk, path); > + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); > if (IS_ERR(path)) { > err = PTR_ERR(path); > goto out; > @@ -4039,7 +4071,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, > trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); > > /* find extent for this block */ > - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); > + path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); > if (IS_ERR(path)) { > err = PTR_ERR(path); > path = NULL; > diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c > index 7358fb3..a84804c 100644 > --- a/fs/ext4/extents_status.c > +++ b/fs/ext4/extents_status.c > @@ -417,7 +417,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, > unsigned short ee_len; > int depth, ee_status, es_status; > > - path = ext4_ext_find_extent(inode, es->es_lblk, NULL); > + path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); > if (IS_ERR(path)) > return; > > @@ -678,6 +678,43 @@ error: > } > > /* > + * ext4_es_cache_extent() inserts information into the extent status > + * tree if and only if there isn't information about the range in > + * question already. > + */ > +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, > + ext4_lblk_t len, ext4_fsblk_t pblk, > + unsigned int status) > +{ > + struct extent_status *es; > + struct extent_status newes; > + ext4_lblk_t end = lblk + len - 1; > + > + es_debug("cache [%u/%u) %llu %x to extent status tree of inode %lu\n", > + lblk, len, pblk, status, inode->i_ino); > + if (!len) > + return; > + > + BUG_ON(end < lblk); > + > + write_lock(&EXT4_I(inode)->i_es_lock); > + > + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); > + if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end))) > + goto out; > + > + newes.es_lblk = lblk; > + newes.es_len = len; > + ext4_es_store_pblock(&newes, pblk); > + ext4_es_store_status(&newes, status); > + trace_ext4_es_cache_extent(inode, &newes); > + > + __es_insert_extent(inode, &newes); > +out: > + write_unlock(&EXT4_I(inode)->i_es_lock); > +} > + > +/* > * ext4_es_lookup_extent() looks up an extent in extent status tree. > * > * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. > diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h > index ae1be58..23348d1 100644 > --- a/fs/ext4/extents_status.h > +++ b/fs/ext4/extents_status.h > @@ -71,6 +71,9 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); > extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, > ext4_lblk_t len, ext4_fsblk_t pblk, > unsigned int status); > +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, > + ext4_lblk_t len, ext4_fsblk_t pblk, > + unsigned int status); > extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, > ext4_lblk_t len); > extern void ext4_es_find_delayed_extent_range(struct inode *inode, > diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c > index 49e8bdf..f99bdb8 100644 > --- a/fs/ext4/migrate.c > +++ b/fs/ext4/migrate.c > @@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode, > newext.ee_block = cpu_to_le32(lb->first_block); > newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); > ext4_ext_store_pblock(&newext, lb->first_pblock); > - path = ext4_ext_find_extent(inode, lb->first_block, NULL); > + path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); > > if (IS_ERR(path)) { > retval = PTR_ERR(path); > diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c > index e86dddb..7fa4d85 100644 > --- a/fs/ext4/move_extent.c > +++ b/fs/ext4/move_extent.c > @@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, > int ret = 0; > struct ext4_ext_path *path; > > - path = ext4_ext_find_extent(inode, lblock, *orig_path); > + path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); > if (IS_ERR(path)) > ret = PTR_ERR(path); > else if (path[ext_depth(inode)].p_ext == NULL) > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h > index 47a355b..d892b55 100644 > --- a/include/trace/events/ext4.h > +++ b/include/trace/events/ext4.h > @@ -2192,7 +2192,7 @@ TRACE_EVENT(ext4_ext_remove_space_done, > (unsigned short) __entry->eh_entries) > ); > > -TRACE_EVENT(ext4_es_insert_extent, > +DECLARE_EVENT_CLASS(ext4__es_extent, > TP_PROTO(struct inode *inode, struct extent_status *es), > > TP_ARGS(inode, es), > @@ -2222,6 +2222,18 @@ TRACE_EVENT(ext4_es_insert_extent, > __entry->pblk, show_extent_status(__entry->status)) > ); > > +DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, > + TP_PROTO(struct inode *inode, struct extent_status *es), > + > + TP_ARGS(inode, es) > +); > + > +DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent, > + TP_PROTO(struct inode *inode, struct extent_status *es), > + > + TP_ARGS(inode, es) > +); > + > TRACE_EVENT(ext4_es_remove_extent, > TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), > > -- > 1.7.12.rc0.22.gcdd159b > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html