Hi Allison, I just want to let you know that I have rebased your patches to allow punching hole past i_size, but I am going to send it as a separate series hopefully this week. I need to solve the problem with cleaning EOFBLOCKS_FL flag properly first. So just you know that you do not need to worry about those. Thanks! -Lukas On Mon, 19 Mar 2012, Lukas Czerner wrote: > This commit rewrites ext4 punch hole implementation to use > ext4_ext_remove_space() instead of its home gown way of doing this via > ext4_ext_map_blocks(). There are several reasons for changing this. > > Firstly it is quite non obvious that punching hole needs to > ext4_ext_map_blocks() to punch a hole, especially given that this > function should map blocks, not unmap it. It also required a lot of new > code in ext4_ext_map_blocks(). > > Secondly the design of it is not very effective. The reason is that we > are trying to punch out blocks in ext4_ext_punch_hole() in opposite > direction than in ext4_ext_rm_leaf() which causes the ext4_ext_rm_leaf() > to iterate through the whole tree from the end to the start to find the > requested extent for every extent we are going to punch out. > > And finally the current implementation does not use the existing code, > but bring a lot of new code, which is IMO unnecessary since there > already is some infrastructure we can use. Specifically > ext4_ext_remove_space(). > > This commit changes ext4_ext_remove_space() to accept 'end' parameter so > we can not only truncate to the end of file, but also remove the space > in the middle of the file (punch a hole). Moreover, because the last > block to punch out, might be in the middle of the extent, we have to > split the extent at 'end + 1' so ext4_ext_rm_leaf() can easily either > remove the whole fist part of split extent, or change its size. > > ext4_ext_remove_space() is then used to actually remove the space > (extents) from within the hole, instead of ext4_ext_map_blocks(). > > Note that this also fix the issue with punch hole, where we would forget > to remove empty index blocks from the extent tree, resulting in double > free block error and file system corruption. This is simply because we > now use different code path, where this problem does not exist. > > This has been tested with fsx running for several days and xfstests, > plus xfstest #251 with '-o discard' run on the loop image (which > converts discard requestes into punch hole to the backing file). All of > it on 1K and 4K file system block size. > > Signed-off-by: Lukas Czerner <lczerner@xxxxxxxxxx> > --- > v2: Change condition to distinct punch_hole from truncate in > ext4_ext_remove_space() > > fs/ext4/extents.c | 170 +++++++++++++++++++++++++++------------------------- > 1 files changed, 88 insertions(+), 82 deletions(-) > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index 74f23c2..522f429 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -44,6 +44,14 @@ > > #include <trace/events/ext4.h> > > +/* > + * used by extent splitting. > + */ > +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ > + due to ENOSPC */ > +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ > +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ > + > static int ext4_split_extent(handle_t *handle, > struct inode *inode, > struct ext4_ext_path *path, > @@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle, > int split_flag, > int flags); > > +static int ext4_split_extent_at(handle_t *handle, > + struct inode *inode, > + struct ext4_ext_path *path, > + ext4_lblk_t split, > + int split_flag, > + int flags); > + > static int ext4_ext_truncate_extend_restart(handle_t *handle, > struct inode *inode, > int needed) > @@ -2308,7 +2323,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, > struct ext4_extent *ex; > > /* the header must be checked already in ext4_ext_remove_space() */ > - ext_debug("truncate since %u in leaf\n", start); > + ext_debug("truncate since %u in leaf to %u\n", start, end); > if (!path[depth].p_hdr) > path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); > eh = path[depth].p_hdr; > @@ -2343,7 +2358,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, > ext_debug(" border %u:%u\n", a, b); > > /* If this extent is beyond the end of the hole, skip it */ > - if (end <= ex_ee_block) { > + if (end < ex_ee_block) { > ex--; > ex_ee_block = le32_to_cpu(ex->ee_block); > ex_ee_len = ext4_ext_get_actual_len(ex); > @@ -2482,7 +2497,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) > return 1; > } > > -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, > + ext4_lblk_t end) > { > struct super_block *sb = inode->i_sb; > int depth = ext_depth(inode); > @@ -2491,7 +2507,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) > handle_t *handle; > int i, err; > > - ext_debug("truncate since %u\n", start); > + ext_debug("truncate since %u to %u\n", start, end); > > /* probably first extent we're gonna free will be last in block */ > handle = ext4_journal_start(inode, depth + 1); > @@ -2504,6 +2520,61 @@ again: > trace_ext4_ext_remove_space(inode, start, depth); > > /* > + * Check if we are removing extents inside the extent tree. If that > + * is the case, we are going to punch a hole inside the extent tree > + * so we have to check whether we need to split the extent covering > + * the last block to remove so we can easily remove the part of it > + * in ext4_ext_rm_leaf(). > + */ > + if (end < EXT_MAX_BLOCKS - 1) { > + struct ext4_extent *ex; > + ext4_lblk_t ee_block; > + > + /* find extent for this block */ > + path = ext4_ext_find_extent(inode, end, NULL); > + if (IS_ERR(path)) { > + ext4_journal_stop(handle); > + return PTR_ERR(path); > + } > + depth = ext_depth(inode); > + ex = path[depth].p_ext; > + if (!ex) > + goto cont; > + > + ee_block = le32_to_cpu(ex->ee_block); > + > + /* > + * See if the last block is inside the extent, if so split > + * the extent at 'end' block so we can easily remove the > + * tail of the first part of the split extent in > + * ext4_ext_rm_leaf(). > + */ > + if (end >= ee_block && > + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { > + int split_flag = 0; > + > + if (ext4_ext_is_uninitialized(ex)) > + split_flag = EXT4_EXT_MARK_UNINIT1 | > + EXT4_EXT_MARK_UNINIT2; > + > + /* > + * Split the extent in two so that 'end' is the last > + * block in the first new extent > + */ > + err = ext4_split_extent_at(handle, inode, path, > + end + 1, split_flag, > + EXT4_GET_BLOCKS_PRE_IO | > + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); > + > + if (err < 0) > + goto out; > + } > + ext4_ext_drop_refs(path); > + kfree(path); > + } > +cont: > + > + /* > * We start scanning from right side, freeing all the blocks > * after i_size and walking into the tree depth-wise. > */ > @@ -2515,6 +2586,7 @@ again: > } > path[0].p_depth = depth; > path[0].p_hdr = ext_inode_hdr(inode); > + > if (ext4_ext_check(inode, path[0].p_hdr, depth)) { > err = -EIO; > goto out; > @@ -2526,7 +2598,7 @@ again: > /* this is leaf block */ > err = ext4_ext_rm_leaf(handle, inode, path, > &partial_cluster, start, > - EXT_MAX_BLOCKS - 1); > + end); > /* root level has p_bh == NULL, brelse() eats this */ > brelse(path[i].p_bh); > path[i].p_bh = NULL; > @@ -2709,14 +2781,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) > } > > /* > - * used by extent splitting. > - */ > -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ > - due to ENOSPC */ > -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ > -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ > - > -/* > * ext4_split_extent_at() splits an extent at given block. > * > * @handle: the journal handle > @@ -4228,7 +4292,7 @@ void ext4_ext_truncate(struct inode *inode) > > last_block = (inode->i_size + sb->s_blocksize - 1) > >> EXT4_BLOCK_SIZE_BITS(sb); > - err = ext4_ext_remove_space(inode, last_block); > + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); > > /* In a multi-transaction truncate, we only make the final > * transaction synchronous. > @@ -4705,14 +4769,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) > { > struct inode *inode = file->f_path.dentry->d_inode; > struct super_block *sb = inode->i_sb; > - struct ext4_ext_cache cache_ex; > - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; > + ext4_lblk_t first_block, stop_block; > struct address_space *mapping = inode->i_mapping; > - struct ext4_map_blocks map; > handle_t *handle; > loff_t first_page, last_page, page_len; > loff_t first_page_offset, last_page_offset; > - int ret, credits, blocks_released, err = 0; > + int credits, err = 0; > > /* No need to punch hole beyond i_size */ > if (offset >= inode->i_size) > @@ -4728,10 +4790,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) > offset; > } > > - first_block = (offset + sb->s_blocksize - 1) >> > - EXT4_BLOCK_SIZE_BITS(sb); > - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); > - > first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; > last_page = (offset + length) >> PAGE_CACHE_SHIFT; > > @@ -4810,7 +4868,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) > } > } > > - > /* > * If i_size is contained in the last page, we need to > * unmap and zero the partial page after i_size > @@ -4830,73 +4887,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) > } > } > > + first_block = (offset + sb->s_blocksize - 1) >> > + EXT4_BLOCK_SIZE_BITS(sb); > + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); > + > /* If there are no blocks to remove, return now */ > - if (first_block >= last_block) > + if (first_block >= stop_block) > goto out; > > down_write(&EXT4_I(inode)->i_data_sem); > ext4_ext_invalidate_cache(inode); > ext4_discard_preallocations(inode); > > - /* > - * Loop over all the blocks and identify blocks > - * that need to be punched out > - */ > - iblock = first_block; > - blocks_released = 0; > - while (iblock < last_block) { > - max_blocks = last_block - iblock; > - num_blocks = 1; > - memset(&map, 0, sizeof(map)); > - map.m_lblk = iblock; > - map.m_len = max_blocks; > - ret = ext4_ext_map_blocks(handle, inode, &map, > - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); > - > - if (ret > 0) { > - blocks_released += ret; > - num_blocks = ret; > - } else if (ret == 0) { > - /* > - * If map blocks could not find the block, > - * then it is in a hole. If the hole was > - * not already cached, then map blocks should > - * put it in the cache. So we can get the hole > - * out of the cache > - */ > - memset(&cache_ex, 0, sizeof(cache_ex)); > - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && > - !cache_ex.ec_start) { > + err = ext4_ext_remove_space(inode, first_block, stop_block - 1); > > - /* The hole is cached */ > - num_blocks = cache_ex.ec_block + > - cache_ex.ec_len - iblock; > - > - } else { > - /* The block could not be identified */ > - err = -EIO; > - break; > - } > - } else { > - /* Map blocks error */ > - err = ret; > - break; > - } > - > - if (num_blocks == 0) { > - /* This condition should never happen */ > - ext_debug("Block lookup failed"); > - err = -EIO; > - break; > - } > - > - iblock += num_blocks; > - } > - > - if (blocks_released > 0) { > - ext4_ext_invalidate_cache(inode); > - ext4_discard_preallocations(inode); > - } > + ext4_ext_invalidate_cache(inode); > + ext4_discard_preallocations(inode); > > if (IS_SYNC(inode)) > ext4_handle_sync(handle); > -- -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html