Ritesh Harjani wrote on 2020/8/18 11:09: > > > On 8/17/20 1:06 PM, brookxu wrote: >> In the scenario of writing sparse files, the per-inode prealloc list may >> be very long, resulting in high overhead for ext4_mb_use_preallocated(). >> To circumvent this problem, we limit the maximum length of per-inode >> prealloc list to 512 and allow users to modify it. >> >> After patching, we observed that the sys ratio of cpu has dropped, and >> the system throughput has increased significantly. We created a process >> to write the sparse file, and the running time of the process on the >> fixed kernel was significantly reduced, as follows: >> >> Running time on unfixed kernel: >> [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat >> real 0m2.051s >> user 0m0.008s >> sys 0m2.026s >> >> Running time on fixed kernel: >> [root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat >> real 0m0.471s >> user 0m0.004s >> sys 0m0.395s >> >> V5: >> 1. Fix the wrong parameter of list_move(). >> 2. Use an atomic variable to count the length of the prealloc list. >> >> V4: >> 1. Add performance data to the commit log. 
>> >> Signed-off-by: Chunguang Xu <brookxu@xxxxxxxxxxx> >> --- >> Documentation/admin-guide/ext4.rst | 3 ++ >> fs/ext4/ext4.h | 4 ++- >> fs/ext4/extents.c | 10 +++--- >> fs/ext4/file.c | 2 +- >> fs/ext4/indirect.c | 2 +- >> fs/ext4/inode.c | 6 ++-- >> fs/ext4/ioctl.c | 2 +- >> fs/ext4/mballoc.c | 74 +++++++++++++++++++++++++++++++++----- >> fs/ext4/mballoc.h | 4 +++ >> fs/ext4/move_extent.c | 4 +-- >> fs/ext4/super.c | 3 +- >> fs/ext4/sysfs.c | 2 ++ >> include/trace/events/ext4.h | 17 +++++---- >> 13 files changed, 104 insertions(+), 29 deletions(-) >> >> diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst >> index 9443fce..f37d074 100644 >> --- a/Documentation/admin-guide/ext4.rst >> +++ b/Documentation/admin-guide/ext4.rst >> @@ -482,6 +482,9 @@ Files in /sys/fs/ext4/<devname>: >> multiple of this tuning parameter if the stripe size is not set in the >> ext4 superblock >> + mb_max_inode_prealloc >> + The maximum length of per-inode ext4_prealloc_space list. >> + >> mb_max_to_scan >> The maximum number of extents the multiblock allocator will search to >> find the best extent. 
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index 42f5060..c4b465c 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -1054,6 +1054,7 @@ struct ext4_inode_info { >> struct timespec64 i_crtime; >> /* mballoc */ >> + atomic_t i_prealloc_active; >> struct list_head i_prealloc_list; >> spinlock_t i_prealloc_lock; >> @@ -1501,6 +1502,7 @@ struct ext4_sb_info { >> unsigned int s_mb_stats; >> unsigned int s_mb_order2_reqs; >> unsigned int s_mb_group_prealloc; >> + unsigned int s_mb_max_inode_prealloc; >> unsigned int s_max_dir_size_kb; >> /* where last allocation was done - for stream allocation */ >> unsigned long s_mb_last_group; >> @@ -2651,7 +2653,7 @@ extern int ext4_init_inode_table(struct super_block *sb, >> extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, >> struct ext4_allocation_request *, int *); >> extern int ext4_mb_reserve_blocks(struct super_block *, int); >> -extern void ext4_discard_preallocations(struct inode *); >> +extern void ext4_discard_preallocations(struct inode *, unsigned int); >> extern int __init ext4_init_mballoc(void); >> extern void ext4_exit_mballoc(void); >> extern void ext4_free_blocks(handle_t *handle, struct inode *inode, >> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c >> index 221f240..a40f928 100644 >> --- a/fs/ext4/extents.c >> +++ b/fs/ext4/extents.c >> @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) >> * i_mutex. So we can safely drop the i_data_sem here. >> */ >> BUG_ON(EXT4_JOURNAL(inode) == NULL); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> up_write(&EXT4_I(inode)->i_data_sem); >> *dropped = 1; >> return 0; >> @@ -4272,7 +4272,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, >> * not a good idea to call discard here directly, >> * but otherwise we'd need to call it every free(). 
>> */ >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) >> fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; >> ext4_free_blocks(handle, inode, NULL, newblock, >> @@ -5299,7 +5299,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) >> } >> down_write(&EXT4_I(inode)->i_data_sem); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> ret = ext4_es_remove_extent(inode, punch_start, >> EXT_MAX_BLOCKS - punch_start); >> @@ -5313,7 +5313,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) >> up_write(&EXT4_I(inode)->i_data_sem); >> goto out_stop; >> } >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> ret = ext4_ext_shift_extents(inode, handle, punch_stop, >> punch_stop - punch_start, SHIFT_LEFT); >> @@ -5445,7 +5445,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) >> goto out_stop; >> down_write(&EXT4_I(inode)->i_data_sem); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> path = ext4_find_extent(inode, offset_lblk, NULL, 0); >> if (IS_ERR(path)) { >> diff --git a/fs/ext4/file.c b/fs/ext4/file.c >> index 2a01e31..e3ab8ea 100644 >> --- a/fs/ext4/file.c >> +++ b/fs/ext4/file.c >> @@ -148,7 +148,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) >> !EXT4_I(inode)->i_reserved_data_blocks) >> { >> down_write(&EXT4_I(inode)->i_data_sem); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> up_write(&EXT4_I(inode)->i_data_sem); >> } >> if (is_dx(inode) && filp->private_data) >> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c >> index be2b66e..ec6b930 100644 >> --- a/fs/ext4/indirect.c >> +++ b/fs/ext4/indirect.c >> @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, >> * i_mutex. 
So we can safely drop the i_data_sem here. >> */ >> BUG_ON(EXT4_JOURNAL(inode) == NULL); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> up_write(&EXT4_I(inode)->i_data_sem); >> *dropped = 1; >> return 0; >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >> index 10dd470..bb9e1cd 100644 >> --- a/fs/ext4/inode.c >> +++ b/fs/ext4/inode.c >> @@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode, >> */ >> if ((ei->i_reserved_data_blocks == 0) && >> !inode_is_open_for_write(inode)) >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> } >> static int __check_block_validity(struct inode *inode, const char *func, >> @@ -4056,7 +4056,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) >> if (stop_block > first_block) { >> down_write(&EXT4_I(inode)->i_data_sem); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> ret = ext4_es_remove_extent(inode, first_block, >> stop_block - first_block); >> @@ -4211,7 +4211,7 @@ int ext4_truncate(struct inode *inode) >> down_write(&EXT4_I(inode)->i_data_sem); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) >> err = ext4_ext_truncate(handle, inode); >> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c >> index 999cf6a..a5fcc23 100644 >> --- a/fs/ext4/ioctl.c >> +++ b/fs/ext4/ioctl.c >> @@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb, >> reset_inode_seed(inode); >> reset_inode_seed(inode_bl); >> - ext4_discard_preallocations(inode); >> + ext4_discard_preallocations(inode, 0); >> err = ext4_mark_inode_dirty(handle, inode); >> if (err < 0) { >> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c >> index 51f37f1..58426c6 100644 >> --- a/fs/ext4/mballoc.c >> +++ b/fs/ext4/mballoc.c >> @@ -2739,6 +2739,7 @@ int ext4_mb_init(struct super_block *sb) >> sbi->s_mb_stats = 
MB_DEFAULT_STATS; >> sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; >> sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; >> + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; >> /* >> * The default group preallocation is 512, which for 4k block >> * sizes translates to 2 megabytes. However for bigalloc file >> @@ -3677,6 +3678,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, >> mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); >> } >> +static void ext4_mb_mark_pa_deleted(struct super_block *sb, >> + struct ext4_prealloc_space *pa) >> +{ >> + struct ext4_inode_info *ei; >> + >> + if (pa->pa_deleted) { >> + ext4_warning(sb, "deaded pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", > > s/deaded/deleted ? Ritesh, thanks for taking the time to review these patches. Yes, "deleted" is better; I will fix it in the next version. Thanks again. >> + pa->pa_type, pa->pa_pstart, pa->pa_lstart, >> + pa->pa_len); >> + return; >> + } >> + >> + pa->pa_deleted = 1; >> + >> + if (pa->pa_type == MB_INODE_PA) { >> + ei = EXT4_I(pa->pa_inode); >> + atomic_dec(&ei->i_prealloc_active); >> + } >> +} >> + >> static void ext4_mb_pa_callback(struct rcu_head *head) >> { >> struct ext4_prealloc_space *pa; >> @@ -3709,7 +3730,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, >> return; >> } >> - pa->pa_deleted = 1; >> + ext4_mb_mark_pa_deleted(sb, pa); >> spin_unlock(&pa->pa_lock); >> grp_blk = pa->pa_pstart; >> @@ -3833,6 +3854,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, >> spin_lock(pa->pa_obj_lock); >> list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); >> spin_unlock(pa->pa_obj_lock); >> + atomic_inc(&ei->i_prealloc_active); >> } >> /* >> @@ -4043,7 +4065,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) >> } >> /* seems this one can be freed ... */ >> - pa->pa_deleted = 1; >> + ext4_mb_mark_pa_deleted(sb, pa); >> /* we can trust pa_free ... 
*/ >> free += pa->pa_free; >> @@ -4106,7 +4128,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) >> * >> * FIXME!! Make sure it is valid at all the call sites >> */ >> -void ext4_discard_preallocations(struct inode *inode) >> +void ext4_discard_preallocations(struct inode *inode, unsigned int needed) >> { >> struct ext4_inode_info *ei = EXT4_I(inode); >> struct super_block *sb = inode->i_sb; >> @@ -4124,15 +4146,19 @@ void ext4_discard_preallocations(struct inode *inode) >> mb_debug(sb, "discard preallocation for inode %lu\n", >> inode->i_ino); >> - trace_ext4_discard_preallocations(inode); >> + trace_ext4_discard_preallocations(inode, >> + atomic_read(&ei->i_prealloc_active), needed); >> INIT_LIST_HEAD(&list); >> + if (needed == 0) >> + needed = UINT_MAX; >> + >> repeat: >> /* first, collect all pa's in the inode */ >> spin_lock(&ei->i_prealloc_lock); >> - while (!list_empty(&ei->i_prealloc_list)) { >> - pa = list_entry(ei->i_prealloc_list.next, >> + while (!list_empty(&ei->i_prealloc_list) && needed) { >> + pa = list_entry(ei->i_prealloc_list.prev, >> struct ext4_prealloc_space, pa_inode_list); >> BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); >> spin_lock(&pa->pa_lock); >> @@ -4149,10 +4175,11 @@ void ext4_discard_preallocations(struct inode *inode) >> } >> if (pa->pa_deleted == 0) { >> - pa->pa_deleted = 1; >> + ext4_mb_mark_pa_deleted(sb, pa); >> spin_unlock(&pa->pa_lock); >> list_del_rcu(&pa->pa_inode_list); >> list_add(&pa->u.pa_tmp_list, &list); >> + needed--; >> continue; >> } >> @@ -4453,7 +4480,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) >> BUG_ON(pa->pa_type != MB_GROUP_PA); >> /* seems this one can be freed ... */ >> - pa->pa_deleted = 1; >> + ext4_mb_mark_pa_deleted(sb, pa); > > Isn't this inside ext4_mb_discard_lg_preallocations()? > ext4_mb_mark_pa_deleted() is not required here. Since there is anyway a > BUG_ON() at top which ensures that this is MB_GROUP_PA type only. 
> > -ritesh > > >> spin_unlock(&pa->pa_lock); >> list_del_rcu(&pa->pa_inode_list); >> @@ -4552,10 +4579,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) >> } >> /* >> + * if per-inode prealloc list is too long, trim some PA >> + */ >> +static void ext4_mb_trim_inode_pa(struct inode *inode) >> +{ >> + struct ext4_inode_info *ei = EXT4_I(inode); >> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); >> + int count, delta; >> + >> + count = atomic_read(&ei->i_prealloc_active); >> + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; >> + if (count > sbi->s_mb_max_inode_prealloc + delta) { >> + count -= sbi->s_mb_max_inode_prealloc; >> + ext4_discard_preallocations(inode, count); >> + } >> +} >> + >> +/* >> * release all resource we used in allocation >> */ >> static int ext4_mb_release_context(struct ext4_allocation_context *ac) >> { >> + struct inode *inode = ac->ac_inode; >> + struct ext4_inode_info *ei = EXT4_I(inode); >> struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); >> struct ext4_prealloc_space *pa = ac->ac_pa; >> if (pa) { >> @@ -4581,6 +4627,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) >> ext4_mb_add_n_trim(ac); >> } >> } >> + >> + if (pa->pa_type == MB_INODE_PA) { >> + /* >> + * treat per-inode prealloc list as a lru list, then try >> + * to trim the least recently used PA. 
>> + */ >> + spin_lock(pa->pa_obj_lock); >> + list_move(&pa->pa_inode_list, &ei->i_prealloc_list); >> + spin_unlock(pa->pa_obj_lock); >> + } >> + >> ext4_mb_put_pa(ac, ac->ac_sb, pa); >> } >> if (ac->ac_bitmap_page) >> @@ -4590,6 +4647,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) >> if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) >> mutex_unlock(&ac->ac_lg->lg_mutex); >> ext4_mb_collect_stats(ac); >> + ext4_mb_trim_inode_pa(inode); >> return 0; >> } >> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h >> index 6b4d17c..e75b474 100644 >> --- a/fs/ext4/mballoc.h >> +++ b/fs/ext4/mballoc.h >> @@ -73,6 +73,10 @@ >> */ >> #define MB_DEFAULT_GROUP_PREALLOC 512 >> +/* >> + * maximum length of inode prealloc list >> + */ >> +#define MB_DEFAULT_MAX_INODE_PREALLOC 512 >> struct ext4_free_data { >> /* this links the free block information from sb_info */ >> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c >> index 1ed86fb..0d601b8 100644 >> --- a/fs/ext4/move_extent.c >> +++ b/fs/ext4/move_extent.c >> @@ -686,8 +686,8 @@ >> out: >> if (*moved_len) { >> - ext4_discard_preallocations(orig_inode); >> - ext4_discard_preallocations(donor_inode); >> + ext4_discard_preallocations(orig_inode, 0); >> + ext4_discard_preallocations(donor_inode, 0); >> } >> ext4_ext_drop_refs(path); >> diff --git a/fs/ext4/super.c b/fs/ext4/super.c >> index 330957e..4df5dde 100644 >> --- a/fs/ext4/super.c >> +++ b/fs/ext4/super.c >> @@ -1123,6 +1123,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) >> inode_set_iversion(&ei->vfs_inode, 1); >> spin_lock_init(&ei->i_raw_lock); >> INIT_LIST_HEAD(&ei->i_prealloc_list); >> + atomic_set(&ei->i_prealloc_active, 0); >> spin_lock_init(&ei->i_prealloc_lock); >> ext4_es_init_tree(&ei->i_es_tree); >> rwlock_init(&ei->i_es_lock); >> @@ -1216,7 +1217,7 @@ void ext4_clear_inode(struct inode *inode) >> { >> invalidate_inode_buffers(inode); >> clear_inode(inode); >> - ext4_discard_preallocations(inode); >> + 
ext4_discard_preallocations(inode, 0); >> ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); >> dquot_drop(inode); >> if (EXT4_I(inode)->jinode) { >> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c >> index 6c9fc9e..92f04e9 100644 >> --- a/fs/ext4/sysfs.c >> +++ b/fs/ext4/sysfs.c >> @@ -215,6 +215,7 @@ static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) >> EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); >> EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); >> EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); >> +EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); >> EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); >> EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); >> EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); >> @@ -257,6 +258,7 @@ static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) >> ATTR_LIST(mb_order2_req), >> ATTR_LIST(mb_stream_req), >> ATTR_LIST(mb_group_prealloc), >> + ATTR_LIST(mb_max_inode_prealloc), >> ATTR_LIST(max_writeback_mb_bump), >> ATTR_LIST(extent_max_zeroout_kb), >> ATTR_LIST(trigger_fs_error), >> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h >> index cc41d69..628db6a 100644 >> --- a/include/trace/events/ext4.h >> +++ b/include/trace/events/ext4.h >> @@ -746,24 +746,29 @@ >> ); >> TRACE_EVENT(ext4_discard_preallocations, >> - TP_PROTO(struct inode *inode), >> + TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed), >> - TP_ARGS(inode), >> + TP_ARGS(inode, len, needed), >> TP_STRUCT__entry( >> - __field( dev_t, dev ) >> - __field( ino_t, ino ) >> + __field( dev_t, dev ) >> + __field( ino_t, ino ) >> + __field( unsigned int, len ) >> + __field( unsigned int, needed ) >> ), >> TP_fast_assign( >> __entry->dev = inode->i_sb->s_dev; >> __entry->ino = inode->i_ino; >> + __entry->len = len; >> + __entry->needed = needed; >> ), >> - TP_printk("dev %d,%d ino %lu", >> + 
TP_printk("dev %d,%d ino %lu len: %u needed %u", >> MAJOR(__entry->dev), MINOR(__entry->dev), >> - (unsigned long) __entry->ino) >> + (unsigned long) __entry->ino, __entry->len, >> + __entry->needed) >> ); >> TRACE_EVENT(ext4_mb_discard_preallocations, >>