On Fri, May 13, 2011 at 5:26 AM, Mingming Cao <cmm@xxxxxxxxxx> wrote: > On Mon, 2011-05-02 at 19:05 -0700, Yongqiang Yang wrote: >> v0->v1: >> -- ext4_ext_convert_initialized() zeroout whole extent when the extent's >> length is less than 14. >> >> convert and split unwritten are reimplemented based on ext4_split_extent() >> added in last patch. >> >> Signed-off-by: Yongqiang Yang <xiaoqiangnk@xxxxxxxxx> >> Tested-by: Allison Henderson <achender@xxxxxxxxxxxxxxxxxx> >> --- >> fs/ext4/extents.c | 480 ++++++++--------------------------------------------- >> 1 files changed, 72 insertions(+), 408 deletions(-) >> >> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c >> index db1d67c..9e7c7b3 100644 >> --- a/fs/ext4/extents.c >> +++ b/fs/ext4/extents.c >> @@ -2757,17 +2757,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, >> struct ext4_map_blocks *map, >> struct ext4_ext_path *path) >> { >> - struct ext4_extent *ex, newex, orig_ex; >> - struct ext4_extent *ex1 = NULL; >> - struct ext4_extent *ex2 = NULL; >> - struct ext4_extent *ex3 = NULL; >> - struct ext4_extent_header *eh; >> + struct ext4_map_blocks split_map; >> + struct ext4_extent zero_ex; >> + struct ext4_extent *ex; >> ext4_lblk_t ee_block, eof_block; >> unsigned int allocated, ee_len, depth; >> - ext4_fsblk_t newblock; >> int err = 0; >> - int ret = 0; >> - int may_zeroout; >> + int split_flag = 0; >> >> ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" >> "block %llu, max_blocks %u\n", inode->i_ino, >> @@ -2779,280 +2775,87 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, >> eof_block = map->m_lblk + map->m_len; >> >> depth = ext_depth(inode); >> - eh = path[depth].p_hdr; >> ex = path[depth].p_ext; >> ee_block = le32_to_cpu(ex->ee_block); >> ee_len = ext4_ext_get_actual_len(ex); >> allocated = ee_len - (map->m_lblk - ee_block); >> - newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); >> - >> - ex2 = ex; >> - orig_ex.ee_block = ex->ee_block; >> - orig_ex.ee_len = cpu_to_le16(ee_len); >> - ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); >> >> + WARN_ON(map->m_lblk < ee_block); >> /* >> * It is safe to convert extent to initialized via explicit >> * zeroout only if extent is fully insde i_size or new_size. >> */ >> - may_zeroout = ee_block + ee_len <= eof_block; >> + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; >> >> - err = ext4_ext_get_access(handle, inode, path + depth); >> - if (err) >> - goto out; >> /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ >> - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && >> + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { >> + err = ext4_ext_zeroout(inode, ex); >> if (err) >> - goto fix_extent_len; >> - /* update the extent length and mark as initialized */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* zeroed the full extent */ >> - return allocated; >> - } >> - >> - /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ >> - if (map->m_lblk > ee_block) { >> - ex1 = ex; >> - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); >> - ext4_ext_mark_uninitialized(ex1); >> - ex2 = &newex; >> - } >> - /* >> - * for sanity, update the length of the ex2 extent before >> - * we insert ex3, if ex1 is NULL. This is to avoid temporary >> - * overlap of blocks. >> - */ >> - if (!ex1 && allocated > map->m_len) >> - ex2->ee_len = cpu_to_le16(map->m_len); >> - /* ex3: to ee_block + ee_len : uninitialised */ >> - if (allocated > map->m_len) { >> - unsigned int newdepth; >> - /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ >> - if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { >> - /* >> - * map->m_lblk == ee_block is handled by the zerouout >> - * at the beginning. >> - * Mark first half uninitialized. >> - * Mark second half initialized and zero out the >> - * initialized extent >> - */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = cpu_to_le16(ee_len - allocated); >> - ext4_ext_mark_uninitialized(ex); >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - >> - ex3 = &newex; >> - ex3->ee_block = cpu_to_le32(map->m_lblk); >> - ext4_ext_store_pblock(ex3, newblock); >> - ex3->ee_len = cpu_to_le16(allocated); >> - err = ext4_ext_insert_extent(handle, inode, path, >> - ex3, 0); >> - if (err == -ENOSPC) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> - if (err) >> - goto fix_extent_len; >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, >> - ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* blocks available from map->m_lblk */ >> - return allocated; >> - >> - } else if (err) >> - goto fix_extent_len; >> - >> - /* >> - * We need to zero out the second half because >> - * an fallocate request can update file size and >> - * converting the second half to initialized extent >> - * implies that we can leak some junk data to user >> - * space. >> - */ >> - err = ext4_ext_zeroout(inode, ex3); >> - if (err) { >> - /* >> - * We should actually mark the >> - * second half as uninit and return error >> - * Insert would have changed the extent >> - */ >> - depth = ext_depth(inode); >> - ext4_ext_drop_refs(path); >> - path = ext4_ext_find_extent(inode, map->m_lblk, >> - path); >> - if (IS_ERR(path)) { >> - err = PTR_ERR(path); >> - return err; >> - } >> - /* get the second half extent details */ >> - ex = path[depth].p_ext; >> - err = ext4_ext_get_access(handle, inode, >> - path + depth); >> - if (err) >> - return err; >> - ext4_ext_mark_uninitialized(ex); >> - ext4_ext_dirty(handle, inode, path + depth); >> - return err; >> - } >> - >> - /* zeroed the second half */ >> - return allocated; >> - } >> - ex3 = &newex; >> - ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); >> - ext4_ext_store_pblock(ex3, newblock + map->m_len); >> - ex3->ee_len = cpu_to_le16(allocated - map->m_len); >> - ext4_ext_mark_uninitialized(ex3); >> - err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); >> - if (err == -ENOSPC && may_zeroout) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> - if (err) >> - goto fix_extent_len; >> - /* update the extent length and mark as initialized */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* zeroed the full extent */ >> - /* blocks available from map->m_lblk */ >> - return allocated; >> - >> - } else if (err) >> - goto fix_extent_len; >> - /* >> - * The depth, and hence eh & ex might change >> - * as part of the insert above. >> - */ >> - newdepth = ext_depth(inode); >> - /* >> - * update the extent length after successful insert of the >> - * split extent >> - */ >> - ee_len -= ext4_ext_get_actual_len(ex3); >> - orig_ex.ee_len = cpu_to_le16(ee_len); >> - may_zeroout = ee_block + ee_len <= eof_block; >> - >> - depth = newdepth; >> - ext4_ext_drop_refs(path); >> - path = ext4_ext_find_extent(inode, map->m_lblk, path); >> - if (IS_ERR(path)) { >> - err = PTR_ERR(path); >> goto out; >> - } >> - eh = path[depth].p_hdr; >> - ex = path[depth].p_ext; >> - if (ex2 != &newex) >> - ex2 = ex; >> >> err = ext4_ext_get_access(handle, inode, path + depth); >> if (err) >> goto out; >> - >> - allocated = map->m_len; >> - >> - /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying >> - * to insert a extent in the middle zerout directly >> - * otherwise give the extent a chance to merge to left >> - */ >> - if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && >> - map->m_lblk != ee_block && may_zeroout) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> - if (err) >> - goto fix_extent_len; >> - /* update the extent length and mark as initialized */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* zero out the first half */ >> - /* blocks available from map->m_lblk */ >> - return allocated; >> - } >> - } >> - /* >> - * If there was a change of depth as part of the >> - * insertion of ex3 above, we need to update the length >> - * of the ex1 extent again here >> - */ >> - if (ex1 && ex1 != ex) { >> - ex1 = ex; >> - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); >> - ext4_ext_mark_uninitialized(ex1); >> - ex2 = &newex; >> - } >> - /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ >> - ex2->ee_block = cpu_to_le32(map->m_lblk); >> - ext4_ext_store_pblock(ex2, newblock); >> - ex2->ee_len = cpu_to_le16(allocated); >> - if (ex2 != ex) >> - goto insert; >> - /* >> - * New (initialized) extent starts from the first block >> - * in the current extent. i.e., ex2 == ex >> - * We have to see if it can be merged with the extent >> - * on the left. >> - */ >> - if (ex2 > EXT_FIRST_EXTENT(eh)) { >> - /* >> - * To merge left, pass "ex2 - 1" to try_to_merge(), >> - * since it merges towards right _only_. >> - */ >> - ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); >> - if (ret) { >> - err = ext4_ext_correct_indexes(handle, inode, path); >> - if (err) >> - goto out; >> - depth = ext_depth(inode); >> - ex2--; >> - } >> + ext4_ext_mark_initialized(ex); >> + ext4_ext_try_to_merge(inode, path, ex); >> + err = ext4_ext_dirty(handle, inode, path + depth); >> + goto out; >> } >> + >> /* >> - * Try to Merge towards right. This might be required >> - * only when the whole extent is being written to. >> - * i.e. ex2 == ex and ex3 == NULL. >> + * four cases: >> + * 1. split the extent into three extents. >> + * 2. split the extent into two extents, zeroout the first half. >> + * 3. split the extent into two extents, zeroout the second half. >> + * 4. split the extent into two extents with out zeroout. >> */ >> - if (!ex3) { >> - ret = ext4_ext_try_to_merge(inode, path, ex2); >> - if (ret) { >> - err = ext4_ext_correct_indexes(handle, inode, path); >> + split_map.m_lblk = map->m_lblk; >> + split_map.m_len = map->m_len; >> + >> + if (allocated > map->m_len) { >> + if (allocated <= EXT4_EXT_ZERO_LEN && >> + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { >> + /* case 3 */ >> + zero_ex.ee_block = >> + cpu_to_le32(map->m_lblk + map->m_len); >> + zero_ex.ee_len = cpu_to_le16(allocated - map->m_len); > Hmm, the original code zero out the entire [map->m_lblk, allocated], > where here we only zero out a portion of it. it doesnt match the split > len below also. Yeah, I just zero out a portion of it which is not the requested. I think the requested part will have non-zero data. > > >> + ext4_ext_store_pblock(&zero_ex, >> + ext4_ext_pblock(ex) + map->m_lblk - ee_block); >> + err = ext4_ext_zeroout(inode, &zero_ex); >> if (err) >> goto out; >> + split_map.m_lblk = map->m_lblk; >> + split_map.m_len = allocated; >> + } else if ((map->m_lblk - ee_block + map->m_len < >> + EXT4_EXT_ZERO_LEN) && >> + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { >> + /* case 2 */ >> + if (map->m_lblk != ee_block) { >> + zero_ex.ee_block = ex->ee_block; >> + zero_ex.ee_len = cpu_to_le16(map->m_lblk - >> + ee_block); > similar to above, the original code zero out the entire [ex->ee_block, > map->m_lblk - ee_block + map->m_len], where here we only zero out a > portion of it. same to the mismatch of the split len also. Similar to above. Just a optimization. > > >> + ext4_ext_store_pblock(&zero_ex, >> + ext4_ext_pblock(ex)); >> + err = ext4_ext_zeroout(inode, &zero_ex); >> + if (err) >> + goto out; >> + } >> + >> - allocated = map->m_lblk - ee_block + map->m_len; >> + >> + split_map.m_lblk = ee_block; >> + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; > > I am also puzzled whether the zeroed-out extent get marked as > initialized, as done in original patch. The whole point of zero out is > to avoid frequent split of the unitizlized extent if the extent is > short. I will take a closer look at the previous patch. > > Another issue, upon success, "allocated" will return from this function. > But here allocated is the zero out length that start from ee_block, not > the length from map->m_lblk. this is wrong, the caller > ext4_ext_map_blocks expecting the length of mapped blocks from > map->m_lblk. We now return more mapped blocks than what really done. I > suspect the fsx error come from this bug. Yeah. it is a bug. Hi Allison, Could you test with modification above? Here is a bug. I will also test it. Thank you. > >> } > > >> } >> - /* Mark modified extent as dirty */ >> - err = ext4_ext_dirty(handle, inode, path + depth); >> - goto out; >> -insert: >> - err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); >> - if (err == -ENOSPC && may_zeroout) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> - if (err) >> - goto fix_extent_len; >> - /* update the extent length and mark as initialized */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* zero out the first half */ >> - return allocated; >> - } else if (err) >> - goto fix_extent_len; >> + >> + allocated = ext4_split_extent(handle, inode, path, >> + &split_map, split_flag, 0); >> + if (allocated < 0) >> + err = allocated; >> + >> out: >> - ext4_ext_show_leaf(inode, path); >> return err ? err : allocated; >> - >> -fix_extent_len: >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_mark_uninitialized(ex); >> - ext4_ext_dirty(handle, inode, path + depth); >> - return err; >> } >> >> /* >> @@ -3083,15 +2886,11 @@ static int ext4_split_unwritten_extents(handle_t *handle, >> struct ext4_ext_path *path, >> int flags) >> { >> - struct ext4_extent *ex, newex, orig_ex; >> - struct ext4_extent *ex1 = NULL; >> - struct ext4_extent *ex2 = NULL; >> - struct ext4_extent *ex3 = NULL; >> - ext4_lblk_t ee_block, eof_block; >> - unsigned int allocated, ee_len, depth; >> - ext4_fsblk_t newblock; >> - int err = 0; >> - int may_zeroout; >> + ext4_lblk_t eof_block; >> + ext4_lblk_t ee_block; >> + struct ext4_extent *ex; >> + unsigned int ee_len; >> + int split_flag = 0, depth; >> >> ext_debug("ext4_split_unwritten_extents: inode %lu, logical" >> "block %llu, max_blocks %u\n", inode->i_ino, >> @@ -3101,155 +2900,20 @@ static int ext4_split_unwritten_extents(handle_t *handle, >> inode->i_sb->s_blocksize_bits; >> if (eof_block < map->m_lblk + map->m_len) >> eof_block = map->m_lblk + map->m_len; >> - >> - depth = ext_depth(inode); >> - ex = path[depth].p_ext; >> - ee_block = le32_to_cpu(ex->ee_block); >> - ee_len = ext4_ext_get_actual_len(ex); >> - allocated = ee_len - (map->m_lblk - ee_block); >> - newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); >> - >> - ex2 = ex; >> - orig_ex.ee_block = ex->ee_block; >> - orig_ex.ee_len = cpu_to_le16(ee_len); >> - ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); >> - >> /* >> * It is safe to convert extent to initialized via explicit >> * zeroout only if extent is fully insde i_size or new_size. >> */ >> - may_zeroout = ee_block + ee_len <= eof_block; >> - >> - /* >> - * If the uninitialized extent begins at the same logical >> - * block where the write begins, and the write completely >> - * covers the extent, then we don't need to split it. >> - */ >> - if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) >> - return allocated; >> - >> - err = ext4_ext_get_access(handle, inode, path + depth); >> - if (err) >> - goto out; >> - /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ >> - if (map->m_lblk > ee_block) { >> - ex1 = ex; >> - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); >> - ext4_ext_mark_uninitialized(ex1); >> - ex2 = &newex; >> - } >> - /* >> - * for sanity, update the length of the ex2 extent before >> - * we insert ex3, if ex1 is NULL. This is to avoid temporary >> - * overlap of blocks. >> - */ >> - if (!ex1 && allocated > map->m_len) >> - ex2->ee_len = cpu_to_le16(map->m_len); >> - /* ex3: to ee_block + ee_len : uninitialised */ >> - if (allocated > map->m_len) { >> - unsigned int newdepth; >> - ex3 = &newex; >> - ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); >> - ext4_ext_store_pblock(ex3, newblock + map->m_len); >> - ex3->ee_len = cpu_to_le16(allocated - map->m_len); >> - ext4_ext_mark_uninitialized(ex3); >> - err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); >> - if (err == -ENOSPC && may_zeroout) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> - if (err) >> - goto fix_extent_len; >> - /* update the extent length and mark as initialized */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* zeroed the full extent */ >> - /* blocks available from map->m_lblk */ >> - return allocated; >> - >> - } else if (err) >> - goto fix_extent_len; >> - /* >> - * The depth, and hence eh & ex might change >> - * as part of the insert above. >> - */ >> - newdepth = ext_depth(inode); >> - /* >> - * update the extent length after successful insert of the >> - * split extent >> - */ >> - ee_len -= ext4_ext_get_actual_len(ex3); >> - orig_ex.ee_len = cpu_to_le16(ee_len); >> - may_zeroout = ee_block + ee_len <= eof_block; >> - >> - depth = newdepth; >> - ext4_ext_drop_refs(path); >> - path = ext4_ext_find_extent(inode, map->m_lblk, path); >> - if (IS_ERR(path)) { >> - err = PTR_ERR(path); >> - goto out; >> - } >> - ex = path[depth].p_ext; >> - if (ex2 != &newex) >> - ex2 = ex; >> + depth = ext_depth(inode); >> + ex = path[depth].p_ext; >> + ee_block = le32_to_cpu(ex->ee_block); >> + ee_len = ext4_ext_get_actual_len(ex); >> >> - err = ext4_ext_get_access(handle, inode, path + depth); >> - if (err) >> - goto out; >> + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; >> + split_flag |= EXT4_EXT_MARK_UNINIT2; >> >> - allocated = map->m_len; >> - } >> - /* >> - * If there was a change of depth as part of the >> - * insertion of ex3 above, we need to update the length >> - * of the ex1 extent again here >> - */ >> - if (ex1 && ex1 != ex) { >> - ex1 = ex; >> - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); >> - ext4_ext_mark_uninitialized(ex1); >> - ex2 = &newex; >> - } >> - /* >> - * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written >> - * using direct I/O, uninitialised still. >> - */ >> - ex2->ee_block = cpu_to_le32(map->m_lblk); >> - ext4_ext_store_pblock(ex2, newblock); >> - ex2->ee_len = cpu_to_le16(allocated); >> - ext4_ext_mark_uninitialized(ex2); >> - if (ex2 != ex) >> - goto insert; >> - /* Mark modified extent as dirty */ >> - err = ext4_ext_dirty(handle, inode, path + depth); >> - ext_debug("out here\n"); >> - goto out; >> -insert: >> - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); >> - if (err == -ENOSPC && may_zeroout) { >> - err = ext4_ext_zeroout(inode, &orig_ex); >> - if (err) >> - goto fix_extent_len; >> - /* update the extent length and mark as initialized */ >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_dirty(handle, inode, path + depth); >> - /* zero out the first half */ >> - return allocated; >> - } else if (err) >> - goto fix_extent_len; >> -out: >> - ext4_ext_show_leaf(inode, path); >> - return err ? err : allocated; >> - >> -fix_extent_len: >> - ex->ee_block = orig_ex.ee_block; >> - ex->ee_len = orig_ex.ee_len; >> - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); >> - ext4_ext_mark_uninitialized(ex); >> - ext4_ext_dirty(handle, inode, path + depth); >> - return err; >> + flags |= EXT4_GET_BLOCKS_PRE_IO; >> + return ext4_split_extent(handle, inode, path, map, split_flag, flags); >> } >> >> static int ext4_convert_unwritten_extents_endio(handle_t *handle, > > > -- Best Wishes Yongqiang Yang -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html