2014-02-11 20:59 GMT+09:00, Lukáš Czerner <lczerner@xxxxxxxxxx>: > On Sun, 2 Feb 2014, Namjae Jeon wrote: > >> Date: Sun, 2 Feb 2014 14:44:34 +0900 >> From: Namjae Jeon <linkinjeon@xxxxxxxxx> >> To: viro@xxxxxxxxxxxxxxxxxx, david@xxxxxxxxxxxxx, bpm@xxxxxxx, >> tytso@xxxxxxx, >> adilger.kernel@xxxxxxxxx, jack@xxxxxxx, mtk.manpages@xxxxxxxxx >> Cc: linux-fsdevel@xxxxxxxxxxxxxxx, xfs@xxxxxxxxxxx, >> linux-ext4@xxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, >> Namjae Jeon <linkinjeon@xxxxxxxxx>, Namjae Jeon >> <namjae.jeon@xxxxxxxxxxx>, >> Ashish Sangwan <a.sangwan@xxxxxxxxxxx> >> Subject: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE >> for >> fallocate >> >> From: Namjae Jeon <namjae.jeon@xxxxxxxxxxx> >> >> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate Hi Lukas. > > Description is missing here, please provide FALLOC_FL_COLLAPSE_RANGE > description so people know what it's supposed to be doing. > > more comments bellow. Okay, I will udpate. > > Thanks! > -Lukas > >> >> Signed-off-by: Namjae Jeon <namjae.jeon@xxxxxxxxxxx> >> Signed-off-by: Ashish Sangwan <a.sangwan@xxxxxxxxxxx> >> --- >> fs/ext4/ext4.h | 3 + >> fs/ext4/extents.c | 297 >> ++++++++++++++++++++++++++++++++++++++++++- >> fs/ext4/move_extent.c | 2 +- >> include/trace/events/ext4.h | 25 ++++ >> 4 files changed, 325 insertions(+), 2 deletions(-) >> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index e618503..5cc015a 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -2745,6 +2745,7 @@ extern int ext4_find_delalloc_cluster(struct inode >> *inode, ext4_lblk_t lblk); >> extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info >> *fieinfo, >> __u64 start, __u64 len); >> extern int ext4_ext_precache(struct inode *inode); >> +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t >> len); >> >> /* move_extent.c */ >> extern void ext4_double_down_write_data_sem(struct inode *first, >> @@ -2754,6 +2755,8 @@ extern void ext4_double_up_write_data_sem(struct >> inode *orig_inode, >> extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, >> __u64 start_orig, __u64 start_donor, >> __u64 len, __u64 *moved_len); >> +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path >> *path, >> + struct ext4_extent **extent); >> >> /* page-io.c */ >> extern int __init ext4_init_pageio(void); >> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c >> index 267c9fb..12338c1 100644 >> --- a/fs/ext4/extents.c >> +++ b/fs/ext4/extents.c >> @@ -4566,12 +4566,16 @@ long ext4_fallocate(struct file *file, int mode, >> loff_t offset, loff_t len) >> unsigned int credits, blkbits = inode->i_blkbits; >> >> /* Return error if mode is not supported */ >> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) >> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | >> + FALLOC_FL_COLLAPSE_RANGE)) >> return -EOPNOTSUPP; >> >> if (mode & FALLOC_FL_PUNCH_HOLE) >> return ext4_punch_hole(inode, offset, len); >> >> + if (mode & FALLOC_FL_COLLAPSE_RANGE) >> + return ext4_collapse_range(inode, offset, len); >> + >> ret = ext4_convert_inline_data(inode); >> if (ret) >> return ret; >> @@ -4870,3 +4874,294 @@ int ext4_fiemap(struct inode *inode, struct >> fiemap_extent_info *fieinfo, >> ext4_es_lru_add(inode); >> return error; >> } >> + >> +/* >> + * ext4_access_path: >> + * Function to access the path buffer for marking it dirty. >> + * It also checks if there are sufficient credits left in the journal >> handle >> + * to update path. >> + */ >> +static int >> +ext4_access_path(handle_t *handle, struct inode *inode, >> + struct ext4_ext_path *path) >> +{ >> + int credits, err; >> + >> + /* >> + * Check if need to extend journal credits >> + * 3 for leaf, sb, and inode plus 2 (bmap and group >> + * descriptor) for each block group; assume two block >> + * groups >> + */ >> + if (handle->h_buffer_credits < 7) { >> + credits = ext4_writepage_trans_blocks(inode); >> + err = ext4_ext_truncate_extend_restart(handle, inode, credits); >> + /* EAGAIN is success */ >> + if (err && err != -EAGAIN) >> + return err; >> + } >> + >> + err = ext4_ext_get_access(handle, inode, path); >> + return err; >> +} >> + >> +/* >> + * ext4_ext_shift_path_extents: >> + * Shift the extents of a path structure lying between path[depth].p_ext >> + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting >> shift >> + * from starting block for each extent. >> + */ >> +static int >> +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t >> shift, >> + struct inode *inode, handle_t *handle, >> + ext4_lblk_t *start) >> +{ >> + int depth, err = 0; >> + struct ext4_extent *ex_start, *ex_last; >> + bool update = 0; >> + depth = path->p_depth; >> + >> + while (depth >= 0) { >> + if (depth == path->p_depth) { >> + ex_start = path[depth].p_ext; >> + if (!ex_start) >> + return -EIO; >> + >> + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); >> + if (!ex_last) >> + return -EIO; >> + >> + err = ext4_access_path(handle, inode, path + depth); >> + if (err) >> + goto out; >> + >> + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) >> + update = 1; >> + >> + *start = ex_last->ee_block + >> + ext4_ext_get_actual_len(ex_last); >> + >> + while (ex_start <= ex_last) { >> + ex_start->ee_block -= shift; >> + if (ex_start > >> + EXT_FIRST_EXTENT(path[depth].p_hdr)) { >> + if (ext4_ext_try_to_merge_right(inode, >> + path, ex_start - 1)) >> + ex_last--; >> + } >> + ex_start++; >> + } >> + err = ext4_ext_dirty(handle, inode, path + depth); >> + if (err) >> + goto out; >> + >> + if (--depth < 0 || !update) >> + break; >> + } >> + >> + /* Update index too */ >> + err = ext4_access_path(handle, inode, path + depth); >> + if (err) >> + goto out; >> + >> + path[depth].p_idx->ei_block -= shift; >> + err = ext4_ext_dirty(handle, inode, path + depth); >> + if (err) >> + goto out; >> + >> + /* we are done if current index is not a starting index */ >> + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) >> + break; >> + >> + depth--; >> + } >> + >> +out: >> + return err; >> +} >> + >> +/* >> + * ext4_ext_shift_extents: >> + * All the extents which lies in the range from start to the last >> allocated >> + * block for the file are shifted downwards by shift blocks. >> + * On success, 0 is returned, error otherwise. >> + */ >> +static int >> +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, >> + ext4_lblk_t start, ext4_lblk_t shift) >> +{ >> + struct ext4_ext_path *path; >> + int ret = 0, depth; >> + struct ext4_extent *extent; >> + ext4_lblk_t stop_block, current_block; >> + ext4_lblk_t ex_start, ex_end; >> + >> + /* Let path point to the last extent */ >> + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); >> + if (IS_ERR(path)) >> + return PTR_ERR(path); >> + >> + depth = path->p_depth; >> + extent = path[depth].p_ext; >> + if (!extent) { >> + ext4_ext_drop_refs(path); >> + kfree(path); >> + return ret; >> + } >> + >> + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); >> + ext4_ext_drop_refs(path); >> + kfree(path); >> + >> + /* Nothing to shift, if hole is at the end of file */ >> + if (start >= stop_block) >> + return ret; >> + >> + /* >> + * Don't start shifting extents until we make sure the hole is big >> + * enough to accomodate the shift. >> + */ >> + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); >> + depth = path->p_depth; >> + extent = path[depth].p_ext; >> + ex_start = extent->ee_block; >> + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); >> + ext4_ext_drop_refs(path); >> + kfree(path); >> + >> + if ((ex_start > start - 1 && shift > ex_start) || >> + (ex_end > start - shift)) >> + return -EINVAL; >> + >> + /* Its safe to start updating extents */ >> + while (start < stop_block) { >> + path = ext4_ext_find_extent(inode, start, NULL, 0); >> + if (IS_ERR(path)) >> + return PTR_ERR(path); >> + depth = path->p_depth; >> + extent = path[depth].p_ext; >> + current_block = extent->ee_block; >> + if (start > current_block) { >> + /* Hole, move to the next extent */ >> + ret = mext_next_extent(inode, path, &extent); >> + if (ret != 0) { >> + ext4_ext_drop_refs(path); >> + kfree(path); >> + if (ret == 1) >> + ret = 0; >> + break; >> + } >> + } >> + ret = ext4_ext_shift_path_extents(path, shift, inode, >> + handle, &start); >> + ext4_ext_drop_refs(path); >> + kfree(path); >> + if (ret) >> + break; >> + } >> + >> + return ret; >> +} >> + >> +/* >> + * ext4_collapse_range: >> + * This implements the fallocate's collapse range functionality for ext4 >> + * Returns: 0 and non-zero on error. >> + */ >> +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) >> +{ >> + struct super_block *sb = inode->i_sb; >> + ext4_lblk_t punch_start, punch_stop; >> + handle_t *handle; >> + unsigned int credits; >> + unsigned int rounding; >> + loff_t ioffset, new_size; >> + int ret; >> + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; >> + >> + BUG_ON(offset + len > i_size_read(inode)); > > So if anyone provides offset + len which exceeds the i_size then it > crashes the kernel ? That does not sound right, or am I missing a > check anywhere ? > > Also in the patch 0/3 you're saying that: > > " It wokrs beyond "EOF", so the extents which are pre-allocated > beyond "EOF" are also updated. " > > so which is true ? Again it would be better to have the description > in this patch as well. You might misunderstand by reading old patch-set. please look at this one. https://lkml.org/lkml/2014/2/2/12 Since then, it has been decided to limit collapse range within file size and there is check in VFS patch for this condition. If user wants to collapse a range that is overlapping with EOF, truncate(2) is better suited. > > Moreover offset and len are loff_t which means that the addition > operation can easily overflow. Okay, I will check. > > Also you're not holding any locks which would prevent i_size from > changing. Okay. > > >> + >> + /* Collapse range works only on fs block size aligned offsets. */ >> + if (offset & blksize_mask || len & blksize_mask) >> + return -EINVAL; >> + >> + if (!S_ISREG(inode->i_mode)) >> + return -EOPNOTSUPP; >> + >> + if (EXT4_SB(sb)->s_cluster_ratio > 1) >> + return -EOPNOTSUPP; > > Please if you're going to write the support for it, make it > complete and provide support for bigalloc file system as well. > > What is the problem when it comes to bigalloc fs ? It should > concern allocation only, extent tree manipulation should be the > same. Acutally, I didn't consider bigalloc feature on collpase range because when I implmemented collapse range, punch hole didn't provide bigalloc support. As you said, bigalloc is only related with allocation, so I will remove this check. There has not been much change in ext4 patch since it was posted first time due to lack of review. > >> + >> + /* Currently just for extent based files */ >> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) >> + return -EOPNOTSUPP; > > That's fine indirect block addressing is pretty much obsolete now. > Ext3 uses it, but we do not need to add functionality to "ext3" > code. > > However the inode flag can change so you should do this under the > mutex lock. Okay. > >> + >> + if (IS_SWAPFILE(inode)) >> + return -ETXTBSY; > > Again, this should be done under the lock. Right. > >> + >> + trace_ext4_collapse_range(inode, offset, len); >> + >> + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); >> + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); > > So far you've been using blksize_mask instead of > EXT4_BLOCK_SIZE_BITS(sb), please use only one to make it easier for > reader. I suggest using EXT4_BLOCK_SIZE_BITS(sb) since you actually > have sb available. Okay, I will use EXT4_BLOCK_SIZE_BITS(sb). > >> + >> + rounding = max_t(uint, 1 << EXT4_BLOCK_SIZE_BITS(sb), PAGE_CACHE_SIZE); >> + ioffset = offset & ~(rounding - 1); > > Why do you need to round it to the whole page ? We don't actually need to round it to page sized boundary but we are using truncate_pagecache_range to truncate pages from offset till EOF. All other places where truncate_pagecache_range is used in kernel, do this sort of rounding, so we just follow the norm. Currently it seems un-necessary, I will remove it. > >> + >> + /* Write out all dirty pages */ >> + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -1); >> + if (ret) >> + return ret; >> + >> + /* Take mutex lock */ >> + mutex_lock(&inode->i_mutex); > > What about append only and immutable files ? You probably need to > check for that we well right ? See ext4_punch_hole() Okay, I will check it. > >> + >> + /* Wait for existing dio to complete */ >> + ext4_inode_block_unlocked_dio(inode); >> + inode_dio_wait(inode); >> + >> + truncate_pagecache_range(inode, ioffset, -1); >> + >> + credits = ext4_writepage_trans_blocks(inode); >> + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); >> + if (IS_ERR(handle)) { >> + ret = PTR_ERR(handle); >> + goto out_dio; >> + } >> + >> + down_write(&EXT4_I(inode)->i_data_sem); >> + >> + ext4_discard_preallocations(inode); >> + >> + ret = ext4_es_remove_extent(inode, punch_start, >> + EXT_MAX_BLOCKS - punch_start - 1); >> + if (ret) >> + goto journal_stop; >> + >> + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); >> + if (ret) >> + goto journal_stop; >> + >> + ret = ext4_ext_shift_extents(inode, handle, punch_stop, >> + punch_stop - punch_start); >> + if (ret) >> + goto journal_stop; >> + >> + if ((offset + len) > i_size_read(inode)) >> + new_size = offset; > > You've hit BUG_ON() on this case at the beginning of the function. I > am confused, please provide proper commit description. Yes, Right. this condition is obsolete as collapse range semantics have been changed since this condition was added. I will remove this one. Thanks for your review! > > Thanks! > -Lukas > >> + else >> + new_size = i_size_read(inode) - len; >> + >> + truncate_setsize(inode, new_size); >> + EXT4_I(inode)->i_disksize = new_size; >> + >> + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); >> + ext4_mark_inode_dirty(handle, inode); >> + >> +journal_stop: >> + ext4_journal_stop(handle); >> + up_write(&EXT4_I(inode)->i_data_sem); >> + >> +out_dio: >> + ext4_inode_resume_unlocked_dio(inode); >> + mutex_unlock(&inode->i_mutex); >> + return ret; >> +} >> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c >> index 773b503..b474558 100644 >> --- a/fs/ext4/move_extent.c >> +++ b/fs/ext4/move_extent.c >> @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct >> ext4_extent *dest) >> * ext4_ext_path structure refers to the last extent, or a negative >> error >> * value on failure. >> */ >> -static int >> +int >> mext_next_extent(struct inode *inode, struct ext4_ext_path *path, >> struct ext4_extent **extent) >> { >> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h >> index 197d312..90e2f71 100644 >> --- a/include/trace/events/ext4.h >> +++ b/include/trace/events/ext4.h >> @@ -2410,6 +2410,31 @@ TRACE_EVENT(ext4_es_shrink_exit, >> __entry->shrunk_nr, __entry->cache_cnt) >> ); >> >> +TRACE_EVENT(ext4_collapse_range, >> + TP_PROTO(struct inode *inode, loff_t offset, loff_t len), >> + >> + TP_ARGS(inode, offset, len), >> + >> + TP_STRUCT__entry( >> + __field(dev_t, dev) >> + __field(ino_t, ino) >> + __field(loff_t, offset) >> + __field(loff_t, len) >> + ), >> + >> + TP_fast_assign( >> + __entry->dev = inode->i_sb->s_dev; >> + __entry->ino = inode->i_ino; >> + __entry->offset = offset; >> + __entry->len = len; >> + ), >> + >> + TP_printk("dev %d,%d ino %lu offset %lld len %lld", >> + MAJOR(__entry->dev), MINOR(__entry->dev), >> + (unsigned long) __entry->ino, >> + __entry->offset, __entry->len) >> +); >> + >> #endif /* _TRACE_EXT4_H */ >> >> /* This part must be outside protection */ >> > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html