EXT4_IOC_CONTROL_PA allows creating a new inode PA (preallocation) based on the specified range, or discarding all inode PAs of the target inode. INTERFACE ----------- #define EXT4_IOC_CONTROL_PA _IOWR('f', 16, struct ext4_prealloc_info) struct ext4_prealloc_info { __u64 pi_pstart; /* physical offset for the start of the PA from * the beginning of the disk (in/out) */ __u32 pi_lstart; /* logical offset for the start of the PA from * the beginning of the file (in/out) */ __u32 pi_len; /* length for this PA (in/out) */ __u32 pi_free; /* the number of free blocks in this PA (out) */ __u16 pi_flags; /* flags for the inode PA setting ioctl (in) */ }; This ioctl has three modes, selected by setting exactly one of the following flags in pi_flags: - EXT4_MB_MANDATORY Create the specified inode PA for the target inode. If the block allocator cannot allocate the specified range, this ioctl returns -ENOSPC. - EXT4_MB_ADVISORY Create the specified inode PA for the target inode. If the block allocator cannot allocate the specified range, this ioctl creates an inode PA in a different range. - EXT4_MB_DISCARD_PA Discard all inode PAs of the target inode. On return, this ioctl fills in struct ext4_prealloc_info: pi_pstart shows where the inode PA was created, and pi_len shows how many blocks it covers. If this ioctl is called with the EXT4_MB_MANDATORY flag and the specified range cannot be allocated, it returns -ENOSPC and fills in struct ext4_prealloc_info with free space information near the specified range, so that the caller can retry. NOTE ------ This ioctl refuses to create a new inode PA once the target inode already has 1024 inode PAs, because memory usage could grow without bound if there were no limit. 
#define EXT4_MAX_PREALLOC 1024 Signed-off-by: Kazuya Mio <k-mio@xxxxxxxxxxxxx> Signed-off-by: Akira Fujita <a-fujita@xxxxxxxxxxxxx> --- fs/ext4/ext4.h | 21 ++++ fs/ext4/inode.c | 2 +- fs/ext4/ioctl.c | 31 +++++ fs/ext4/mballoc.c | 342 ++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/ext4/mballoc.h | 8 ++ 5 files changed, 386 insertions(+), 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index edce3d9..b6be407 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -102,6 +102,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 +/* create an inode PA that has only free blocks */ +#define EXT4_MB_HINT_PA_ONLY 0x1000 struct ext4_allocation_request { @@ -125,6 +127,20 @@ struct ext4_allocation_request { unsigned int flags; }; +/* The maximum number of inode PAs that EXT4_IOC_CONTROL_PA can create */ +#define EXT4_MAX_PREALLOC 1024 + +/* inode PA information */ +struct ext4_prealloc_info { + __u64 pi_pstart; /* physical offset for the start of the PA from + * the beginning of the file (in/out) */ + __u32 pi_lstart; /* logical offset for the start of the PA from + * the beginning of the disk (in/out) */ + __u32 pi_len; /* length for this PA (in/out) */ + __u32 pi_free; /* the number of free blocks in this PA (out) */ + __u16 pi_flags; /* flags for the inode PA setting ioctl (in) */ +}; + /* * For delayed allocation tracking */ @@ -397,6 +413,7 @@ struct ext4_new_group_data { /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_CONTROL_PA _IOWR('f', 16, struct ext4_prealloc_info) /* * ioctl commands in 32 bit emulation @@ -1442,6 +1459,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb, extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct 
super_block *, ext4_group_t, int); +extern int ext4_mb_control_pa(struct inode *inode, + struct ext4_prealloc_info *pi); /* inode.c */ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); @@ -1477,6 +1496,8 @@ extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int flush_completed_IO(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_ind_get_blocks(handle_t *, struct inode *, ext4_lblk_t, + unsigned int, struct buffer_head *, int); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 567c9fc..8f34155 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -916,7 +916,7 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ -static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, +int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, int flags) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 11c1484..ed68a83 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -395,6 +395,35 @@ mext_out: return 0; } + case EXT4_IOC_CONTROL_PA: + { + struct ext4_prealloc_info pi; + int err; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&pi, + (struct ext4_prealloc_info __user *)arg, sizeof(pi))) + return -EFAULT; + + err = ext4_mb_control_pa(inode, &pi); + + /* + * If ext4_mb_control_pa() returns ENOSPC, we need the + * free space information of buddy bitmap to retry. 
+ */ + if (err && err != -ENOSPC) + return err; + + if (copy_to_user((struct ext4_prealloc_info __user *)arg, + &pi, sizeof(pi))) + return -EFAULT; + return err; + } + default: return -ENOTTY; @@ -443,6 +472,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case EXT4_IOC_MOVE_EXT: break; + case EXT4_IOC_CONTROL_PA: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a291cc3..e242a82 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -22,6 +22,7 @@ */ #include "mballoc.h" +#include "ext4_extents.h" #include <linux/debugfs.h> #include <trace/events/ext4.h> @@ -3328,8 +3329,13 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) struct ext4_group_info *grp; struct ext4_inode_info *ei; - /* preallocate only when found space is larger then requested */ - BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + if (ac->ac_flags & EXT4_MB_HINT_PA_ONLY) { + /* EXT4_MB_HINT_PA_ONLY makes all found space preallocated */ + BUG_ON(ac->ac_b_ex.fe_len <= 0); + } else { + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + } BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); @@ -3337,7 +3343,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) if (pa == NULL) return -ENOMEM; - if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + if (!(ac->ac_flags & EXT4_MB_HINT_PA_ONLY) && + ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { int winl; int wins; int win; @@ -3377,7 +3384,12 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); + if (!(ac->ac_flags & EXT4_MB_HINT_PA_ONLY)) { + atomic_set(&pa->pa_count, 1); + } else { + /* EXT4_MB_HINT_PA_ONLY doesn't allocate blocks */ + atomic_set(&pa->pa_count, 0); + } spin_lock_init(&pa->pa_lock); 
INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); @@ -3388,7 +3400,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); - ext4_mb_use_inode_pa(ac, pa); + if (!(ac->ac_flags & EXT4_MB_HINT_PA_ONLY)) + ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); @@ -4210,7 +4223,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { - int freed; + int freed, length_check; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; @@ -4236,6 +4249,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + /* + * In this case, we want a PA that is equal to ar->len. + * So don't adjust ar->len + */ + if ((ar->flags & EXT4_MB_HINT_PA_ONLY) && + (ar->flags & EXT4_MB_HINT_GOAL_ONLY)) { + ar->len = 0; + break; + } /* let others to free the space */ yield(); ar->len = ar->len >> 1; @@ -4246,6 +4268,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } reserv_blks = ar->len; while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { + /* + * In this case, we want a PA that is equal to ar->len. 
+ * So don't adjust ar->len + */ + if ((ar->flags & EXT4_MB_HINT_PA_ONLY) && + (ar->flags & EXT4_MB_HINT_GOAL_ONLY)) { + ar->len = 0; + break; + } ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } @@ -4277,15 +4308,26 @@ repeat: /* allocate space in core */ ext4_mb_regular_allocator(ac); - /* as we've just preallocated more space than - * user requested orinally, we store allocated - * space in a special descriptor */ - if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + if (ac->ac_flags & EXT4_MB_HINT_PA_ONLY) { + /* + * In EXT4_MB_HINT_PA_ONLY case, all of the found + * blocks are new PA space. + */ + length_check = (ac->ac_b_ex.fe_len > 0); + } else { + /* as we've just preallocated more space than + * user requested orinally, we store allocated + * space in a special descriptor */ + length_check = + (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len); + } + if (ac->ac_status == AC_STATUS_FOUND && length_check) ext4_mb_new_preallocation(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + if (!(ac->ac_flags & EXT4_MB_HINT_PA_ONLY)) + *errp = ext4_mb_mark_diskspace_used(ac, handle, + reserv_blks); if (*errp == -EAGAIN) { /* * drop the reference that we took @@ -4307,9 +4349,12 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) - goto repeat; + if (!(ac->ac_flags & EXT4_MB_HINT_PA_ONLY)) { + freed = ext4_mb_discard_preallocations(sb, + ac->ac_o_ex.fe_len); + if (freed) + goto repeat; + } *errp = -ENOSPC; ac->ac_b_ex.fe_len = 0; ar->len = 0; @@ -4321,10 +4366,12 @@ repeat: out2: kmem_cache_free(ext4_ac_cachep, ac); out1: - if (inquota && ar->len < inquota) + if (inquota && (ar->flags & EXT4_MB_HINT_PA_ONLY)) + dquot_free_block(ar->inode, inquota); + else if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); out3: - if (!ar->len) { + if (!ar->len || (ar->flags & 
EXT4_MB_HINT_PA_ONLY)) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, @@ -4643,3 +4690,264 @@ error_return: kmem_cache_free(ext4_ac_cachep, ac); return; } + +/** + * ext4_mb_check_offset_overlap - Check file offset overlap + * + * @inode: target inode + * @lstart: start file offset + * @len: the number of blocks to check overlap + * + * This function checks whether specified file offset has + * allocated/pre-allocated blocks or not. Return 0 if there is no allocated + * block in the range, return -EINVAL if there are some allocated blocks, + * or an error code on failure. + */ +static int ext4_mb_check_offset_overlap(struct inode *inode, + ext4_lblk_t lstart, unsigned int len) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_prealloc_space *pa; + struct buffer_head dummy_bh; + int retval = 0; + + /* check allocated blocks */ + if (ei->i_flags & EXT4_EXTENTS_FL) { + retval = ext4_ext_get_blocks(NULL, inode, lstart, + 1, &dummy_bh, 0); + if (retval) { + goto out; + } else { + struct ext4_ext_path *path = NULL; + struct ext4_extent newex; + + path = ext4_ext_find_extent(inode, lstart, path); + if (IS_ERR(path)) { + retval = PTR_ERR(path); + goto out; + } + + newex.ee_block = cpu_to_le32(lstart); + newex.ee_len = cpu_to_le16(len); + retval = ext4_ext_check_overlap(inode, &newex, path); + + ext4_ext_drop_refs(path); + kfree(path); + + if (retval) + goto out; + } + } else { + ext4_lblk_t check_off = lstart; + /* with non-extent format, check one block at a time */ + while (check_off < lstart + len) { + retval = ext4_ind_get_blocks(NULL, inode, check_off++, + 1, &dummy_bh, 0); + if (retval) + goto out; + } + } + + /* check pre-allocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + if (!(pa->pa_lstart + pa->pa_len <= lstart || + pa->pa_lstart >= lstart + len)) { + retval = 1; + break; + } + } + 
rcu_read_unlock(); + +out: + if (retval > 0) { + ext4_debug("Offset %u is already allocated block [inode %lu]\n", + lstart, inode->i_ino); + retval = -EINVAL; + } + + return retval; +} + +/** + * ext4_mb_control_pa - Set the inode PA or discard all inode PA + * + * @inode: target inode + * @pi: the information of the inode PA and flags + * + * Return 0 if successful or a negative value on failure. + */ +int ext4_mb_control_pa(struct inode *inode, struct ext4_prealloc_info *pi) +{ + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_allocation_request ar; + struct ext4_prealloc_space *tmp; + ext4_group_t group, end_group; + ext4_fsblk_t blocks = 0; + ext4_grpblk_t pa_off, end_off; + handle_t *handle; + loff_t maxbytes; + int flag_count; + int bsbits = EXT4_BLOCK_SIZE_BITS(sb); + int pa_count = 0; + int i, err1; + + flag_count = ((pi->pi_flags & EXT4_MB_MANDATORY) > 0) + + ((pi->pi_flags & EXT4_MB_ADVISORY) > 0) + + ((pi->pi_flags & EXT4_MB_DISCARD_PA) > 0); + if (flag_count != 1) { + ext4_debug("pi_flags should be set only one flag " + "[inode %lu]\n", inode->i_ino); + return -EINVAL; + } + + if (pi->pi_flags & EXT4_MB_DISCARD_PA) { + down_write(&ei->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&ei->i_data_sem); + return 0; + } + + /* calculate block group number of PA start and end block */ + ext4_get_group_no_and_offset(sb, pi->pi_pstart, &group, &pa_off); + ext4_get_group_no_and_offset(sb, pi->pi_pstart + pi->pi_len - 1, + &end_group, &end_off); + + /* Get maximum file size */ + maxbytes = (ei->i_flags & EXT4_EXTENTS_FL) ? 
+ sb->s_maxbytes : EXT4_SB(sb)->s_bitmap_maxbytes; + + /* The "-10" takes into account some metadata blocks */ + if (pi->pi_len > EXT4_BLOCKS_PER_GROUP(sb) - 10) { + ext4_debug("pi_len should not be more than %lu [inode %lu]\n", + EXT4_BLOCKS_PER_GROUP(sb) - 10, inode->i_ino); + return -EINVAL; + } else if (group != end_group) { + ext4_debug("pi_len steps over BG boundary (%u blocks over) " + "[inode %lu]\n", end_off + 1, inode->i_ino); + return -EINVAL; + } else if (!pi->pi_len) { + ext4_debug("pi_len cannot be set 0 because the length of PA" + "is more than one block [inode %lu]\n", + inode->i_ino); + return -EINVAL; + } else if (pi->pi_pstart + pi->pi_len > + ext4_blocks_count(EXT4_SB(sb)->s_es)) { + ext4_debug("pi_pstart is too big: pi_pstart = %llu pi_len = %u" + " [inode %lu]\n", pi->pi_pstart, + pi->pi_len, inode->i_ino); + return -EFBIG; + } else if (((loff_t)pi->pi_lstart + pi->pi_len) << bsbits + > maxbytes) { + ext4_debug("pi_lstart is too big: pi_lstart = %u pi_len = %u" + " [inode %lu]\n", pi->pi_lstart, + pi->pi_len, inode->i_ino); + return -EINVAL; + } + + /* + * We get journal handle before locking i_data_sem for + * ext4_dirty_inode() called in ext4_mb_new_blocsk(). + */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&ei->i_data_sem); + + err1 = ext4_mb_check_offset_overlap(inode, pi->pi_lstart, pi->pi_len); + if (err1) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &ei->i_prealloc_list, pa_inode_list) + pa_count++; + rcu_read_unlock(); + + /* + * If inode PA is already created more than EXT4_MAX_PREALLOC, + * EXT4_IOC_CONTROL_PA cannot create any inode PAs. 
+ */ + if (pa_count >= EXT4_MAX_PREALLOC) { + err1 = -EBUSY; + goto out; + } + + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = pi->pi_len; + ar.logical = pi->pi_lstart; + ar.goal = pi->pi_pstart; + + ar.flags = EXT4_MB_HINT_PA_ONLY | EXT4_MB_HINT_TRY_GOAL; + if (pi->pi_flags & EXT4_MB_MANDATORY) + ar.flags |= EXT4_MB_HINT_GOAL_ONLY; + + /* create inode PA */ + blocks = ext4_mb_new_blocks(handle, &ar, &err1); + +out: + up_write(&ei->i_data_sem); + ext4_journal_stop(handle); + pi->pi_len = 0; + + if (!err1) { + /* success creating inode PA */ + pi->pi_pstart = blocks; + pi->pi_len = ar.len; + return 0; + } + + if (err1 == -ENOSPC && (pi->pi_flags & EXT4_MB_MANDATORY)) { + struct ext4_buddy e4b; + ext4_group_t ngroups; + ext4_grpblk_t off; + int max, bit, err2; + void *buddy; + + /* search free space and return its information to user space */ + ngroups = ext4_get_groups_count(sb); + if (!(ei->i_flags & EXT4_EXTENTS_FL)) + ngroups = EXT4_SB(sb)->s_blockfile_groups; + + for (i = 0, off = pa_off; i <= ngroups; i++, group++) { + if (group == ngroups) + group = 0; + + err2 = ext4_mb_load_buddy(sb, group, &e4b); + if (err2) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + continue; + } + ext4_lock_group(sb, group); + buddy = mb_find_buddy(&e4b, 0, &max); + BUG_ON(buddy == NULL); + + /* + * If check BG of the inode PA again, set maximum + * length to avoid double checking of the first check + */ + if (i == ngroups) + max = pa_off; + bit = mb_find_next_zero_bit(buddy, max, off); + if (bit < max) { + pi->pi_pstart = ext4_group_first_block_no(sb, + group) + bit; + pi->pi_len = mb_find_next_bit(buddy, + max, bit) - bit; + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + break; + } + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + off = 0; + } + } + + return err1; +} + diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b619322..4d040a1 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -225,4 
+225,12 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, { return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; } + +/* + * EXT4_IOC_CONTROL_PA flags to specify its behavior + */ +#define EXT4_MB_MANDATORY 0x0001 +#define EXT4_MB_ADVISORY 0x0002 +#define EXT4_MB_DISCARD_PA 0x0004 + #endif -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html