Split the largest-free-order group lists and the average-fragment-size
group lists into two sets, adding a second pair of lists for IOPS/fast
storage groups, and make cr 0 / cr 1 group scanning for metadata block
allocation proceed in the following order:

- cr 0 on the largest-free-order IOPS group lists
- cr 1 on the average-fragment-size IOPS group lists
- cr 0 on the largest-free-order non-IOPS group lists
- cr 1 on the average-fragment-size non-IOPS group lists
- cr >= 2 performs the linear search as before

Non-metadata block allocation does not allocate from the IOPS groups.
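For illustration, here is a minimal sketch of the resulting scan order for
a metadata allocation. It is not part of the patch: scan_lfo_lists(),
scan_afs_lists() and linear_scan() are hypothetical helpers standing in
for the cr 0 / cr 1 logic in ext4_mb_choose_next_group_cr0()/cr1() and
for the existing linear group scan.

	/* Hypothetical helpers, shown only to make the ordering concrete. */
	static bool scan_lfo_lists(struct ext4_allocation_context *ac, bool iops);
	static bool scan_afs_lists(struct ext4_allocation_context *ac, bool iops);
	static bool linear_scan(struct ext4_allocation_context *ac);

	static bool choose_groups_for_metadata(struct ext4_allocation_context *ac)
	{
		/* cr 0: largest-free-order lists of the IOPS groups */
		if (scan_lfo_lists(ac, true))
			return true;
		/* cr 1: average-fragment-size lists of the IOPS groups */
		if (scan_afs_lists(ac, true))
			return true;
		/*
		 * No suitable IOPS group: drop the metadata hint and retry
		 * cr 0 / cr 1 on the non-IOPS lists.
		 */
		ac->ac_flags &= ~EXT4_MB_HINT_METADATA;
		if (scan_lfo_lists(ac, false))
			return true;
		if (scan_afs_lists(ac, false))
			return true;
		/* cr >= 2: linear search over all groups, as before */
		return linear_scan(ac);
	}

The fallback in the middle mirrors what ext4_mb_choose_next_group_cr1()
does below: when no IOPS group can satisfy the request, it clears
EXT4_MB_HINT_METADATA and restarts at cr 0 on the regular lists.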
Signed-off-by: Bobi Jam <bobijam@xxxxxxxxxxx>
---
 fs/ext4/balloc.c  |   2 +-
 fs/ext4/ext4.h    |  12 +++++
 fs/ext4/mballoc.c | 154 ++++++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 134 insertions(+), 34 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c1edde8..7b1b3ec 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -739,7 +739,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
-	ar.flags = flags;
+	ar.flags = flags | EXT4_MB_HINT_METADATA;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8104a21..3444b6e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -382,6 +382,7 @@ struct flex_groups {
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+#define EXT4_BG_IOPS		0x0010 /* In IOPS/fast storage */
 
 /*
  * Macro-instructions used to manage group descriptors
@@ -1112,6 +1113,8 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_UNSIGNED_HASH	0x0002	/* Unsigned dirhash in use */
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
 
+#define EXT2_FLAGS_HAS_IOPS		0x0080	/* has IOPS storage */
+
 /*
  * Mount flags set via mount options or defaults
  */
@@ -1514,8 +1517,12 @@ struct ext4_sb_info {
 	atomic_t s_retry_alloc_pending;
 	struct list_head *s_mb_avg_fragment_size;
 	rwlock_t *s_mb_avg_fragment_size_locks;
+	struct list_head *s_avg_fragment_size_list_iops; /* avg_fragment_size for IOPS groups */
+	rwlock_t *s_avg_fragment_size_locks_iops;
 	struct list_head *s_mb_largest_free_orders;
 	rwlock_t *s_mb_largest_free_orders_locks;
+	struct list_head *s_largest_free_orders_list_iops; /* largest_free_orders for IOPS grps */
+	rwlock_t *s_largest_free_orders_locks_iops;
 
 	/* tunables */
 	unsigned long s_stripe;
@@ -3366,6 +3373,7 @@ struct ext4_group_info {
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
 	(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
 #define EXT4_GROUP_INFO_BBITMAP_READ_BIT	4
+#define EXT4_GROUP_INFO_IOPS_BIT		5
 
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3382,6 +3390,10 @@ struct ext4_group_info {
 	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
 	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_IOPS(grp) \
+	(test_bit(EXT4_GROUP_INFO_IOPS_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_IOPS(grp) \
+	(set_bit(EXT4_GROUP_INFO_IOPS_BIT, &((grp)->bb_state)))
 
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 20f67a2..6d218af 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -828,6 +828,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	rwlock_t *afs_locks;
+	struct list_head *afs_list;
 	int new_order;
 
 	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
@@ -838,20 +840,23 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
 	if (new_order == grp->bb_avg_fragment_size_order)
 		return;
 
+	if (EXT4_MB_GRP_TEST_IOPS(grp)) {
+		afs_locks = sbi->s_avg_fragment_size_locks_iops;
+		afs_list = sbi->s_avg_fragment_size_list_iops;
+	} else {
+		afs_locks = sbi->s_mb_avg_fragment_size_locks;
+		afs_list = sbi->s_mb_avg_fragment_size;
+	}
+
 	if (grp->bb_avg_fragment_size_order != -1) {
-		write_lock(&sbi->s_mb_avg_fragment_size_locks[
-				grp->bb_avg_fragment_size_order]);
+		write_lock(&afs_locks[grp->bb_avg_fragment_size_order]);
 		list_del(&grp->bb_avg_fragment_size_node);
-		write_unlock(&sbi->s_mb_avg_fragment_size_locks[
-				grp->bb_avg_fragment_size_order]);
+		write_unlock(&afs_locks[grp->bb_avg_fragment_size_order]);
 	}
 	grp->bb_avg_fragment_size_order = new_order;
-	write_lock(&sbi->s_mb_avg_fragment_size_locks[
-			grp->bb_avg_fragment_size_order]);
-	list_add_tail(&grp->bb_avg_fragment_size_node,
-		&sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
-	write_unlock(&sbi->s_mb_avg_fragment_size_locks[
-			grp->bb_avg_fragment_size_order]);
+	write_lock(&afs_locks[new_order]);
+	list_add_tail(&grp->bb_avg_fragment_size_node, &afs_list[new_order]);
+	write_unlock(&afs_locks[new_order]);
 }
 
 /*
@@ -863,6 +868,10 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_group_info *iter, *grp;
+	bool iops = ac->ac_flags & EXT4_MB_HINT_METADATA &&
+		    ac->ac_sb->s_flags & EXT2_FLAGS_HAS_IOPS;
+	rwlock_t *lfo_locks;
+	struct list_head *lfo_list;
 	int i;
 
 	if (ac->ac_status == AC_STATUS_FOUND)
@@ -871,17 +880,25 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
 	if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
 		atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
 
+	if (iops) {
+		lfo_locks = sbi->s_largest_free_orders_locks_iops;
+		lfo_list = sbi->s_largest_free_orders_list_iops;
+	} else {
+		lfo_locks = sbi->s_mb_largest_free_orders_locks;
+		lfo_list = sbi->s_mb_largest_free_orders;
+	}
+
 	grp = NULL;
 	for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
-		if (list_empty(&sbi->s_mb_largest_free_orders[i]))
+		if (list_empty(&lfo_list[i]))
 			continue;
-		read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
-		if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
-			read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+		read_lock(&lfo_locks[i]);
+		if (list_empty(&lfo_list[i])) {
+			read_unlock(&lfo_locks[i]);
 			continue;
 		}
 		grp = NULL;
-		list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
+		list_for_each_entry(iter, &lfo_list[i],
 				    bb_largest_free_order_node) {
 			if (sbi->s_mb_stats)
 				atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
@@ -890,7 +907,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
 				break;
 			}
 		}
-		read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+		read_unlock(&lfo_locks[i]);
 		if (grp)
 			break;
 	}
@@ -913,6 +930,10 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_group_info *grp = NULL, *iter;
+	bool iops = ac->ac_flags & EXT4_MB_HINT_METADATA &&
+		    ac->ac_sb->s_flags & EXT2_FLAGS_HAS_IOPS;
+	rwlock_t *afs_locks;
+	struct list_head *afs_list;
 	int i;
 
 	if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
@@ -920,16 +941,24 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
 		atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
 	}
 
+	if (iops) {
+		afs_locks = sbi->s_avg_fragment_size_locks_iops;
+		afs_list = sbi->s_avg_fragment_size_list_iops;
+	} else {
+		afs_locks = sbi->s_mb_avg_fragment_size_locks;
+		afs_list = sbi->s_mb_avg_fragment_size;
+	}
+
 	for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
 	     i < MB_NUM_ORDERS(ac->ac_sb); i++) {
-		if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
+		if (list_empty(&afs_list[i]))
 			continue;
-		read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
-		if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
-			read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+		read_lock(&afs_locks[i]);
+		if (list_empty(&afs_list[i])) {
+			read_unlock(&afs_locks[i]);
 			continue;
 		}
-		list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
+		list_for_each_entry(iter, &afs_list[i],
 				    bb_avg_fragment_size_node) {
 			if (sbi->s_mb_stats)
 				atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
@@ -938,7 +967,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
 				break;
 			}
 		}
-		read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+		read_unlock(&afs_locks[i]);
 		if (grp)
 			break;
 	}
@@ -947,7 +976,15 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
 	if (grp) {
 		*group = grp->bb_group;
 		ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
 	} else {
-		*new_cr = 2;
+		if (iops) {
+			/* cannot find proper group in IOPS storage,
+			 * fall back to cr0 for non-IOPS groups.
+			 */
+			ac->ac_flags &= ~EXT4_MB_HINT_METADATA;
+			*new_cr = 0;
+		} else {
+			*new_cr = 2;
+		}
 	}
 }
@@ -1030,6 +1067,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	rwlock_t *lfo_locks;
+	struct list_head *lfo_list;
 	int i;
 
 	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
@@ -1042,21 +1081,24 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
 		return;
 	}
 
+	if (EXT4_MB_GRP_TEST_IOPS(grp)) {
+		lfo_locks = sbi->s_largest_free_orders_locks_iops;
+		lfo_list = sbi->s_largest_free_orders_list_iops;
+	} else {
+		lfo_locks = sbi->s_mb_largest_free_orders_locks;
+		lfo_list = sbi->s_mb_largest_free_orders;
+	}
+
 	if (grp->bb_largest_free_order >= 0) {
-		write_lock(&sbi->s_mb_largest_free_orders_locks[
-					      grp->bb_largest_free_order]);
+		write_lock(&lfo_locks[grp->bb_largest_free_order]);
 		list_del_init(&grp->bb_largest_free_order_node);
-		write_unlock(&sbi->s_mb_largest_free_orders_locks[
-					      grp->bb_largest_free_order]);
+		write_unlock(&lfo_locks[grp->bb_largest_free_order]);
 	}
 	grp->bb_largest_free_order = i;
 	if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
-		write_lock(&sbi->s_mb_largest_free_orders_locks[
-					      grp->bb_largest_free_order]);
-		list_add_tail(&grp->bb_largest_free_order_node,
-			      &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
-		write_unlock(&sbi->s_mb_largest_free_orders_locks[
-					      grp->bb_largest_free_order]);
+		write_lock(&lfo_locks[i]);
+		list_add_tail(&grp->bb_largest_free_order_node, &lfo_list[i]);
+		write_unlock(&lfo_locks[i]);
 	}
 }
@@ -3150,6 +3192,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	if (desc->bg_flags & EXT4_BG_IOPS)
+		EXT4_MB_GRP_SET_IOPS(meta_group_info[i]);
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
 	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
@@ -3423,6 +3467,24 @@ int ext4_mb_init(struct super_block *sb)
 		INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
 		rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
 	}
+	sbi->s_avg_fragment_size_list_iops =
+		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+			      GFP_KERNEL);
+	if (!sbi->s_avg_fragment_size_list_iops) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	sbi->s_avg_fragment_size_locks_iops =
+		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+			      GFP_KERNEL);
+	if (!sbi->s_avg_fragment_size_locks_iops) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+		INIT_LIST_HEAD(&sbi->s_avg_fragment_size_list_iops[i]);
+		rwlock_init(&sbi->s_avg_fragment_size_locks_iops[i]);
+	}
 	sbi->s_mb_largest_free_orders =
 		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
 			      GFP_KERNEL);
@@ -3441,6 +3503,24 @@ int ext4_mb_init(struct super_block *sb)
 		INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
 		rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
 	}
+	sbi->s_largest_free_orders_list_iops =
+		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+			      GFP_KERNEL);
+	if (!sbi->s_largest_free_orders_list_iops) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	sbi->s_largest_free_orders_locks_iops =
+		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+			      GFP_KERNEL);
+	if (!sbi->s_largest_free_orders_locks_iops) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+		INIT_LIST_HEAD(&sbi->s_largest_free_orders_list_iops[i]);
+		rwlock_init(&sbi->s_largest_free_orders_locks_iops[i]);
+	}
 
 	spin_lock_init(&sbi->s_md_lock);
 	sbi->s_mb_free_pending = 0;
@@ -3512,8 +3592,12 @@ int ext4_mb_init(struct super_block *sb)
 out:
 	kfree(sbi->s_mb_avg_fragment_size);
 	kfree(sbi->s_mb_avg_fragment_size_locks);
+	kfree(sbi->s_avg_fragment_size_list_iops);
+	kfree(sbi->s_avg_fragment_size_locks_iops);
 	kfree(sbi->s_mb_largest_free_orders);
 	kfree(sbi->s_mb_largest_free_orders_locks);
+	kfree(sbi->s_largest_free_orders_list_iops);
+	kfree(sbi->s_largest_free_orders_locks_iops);
 	kfree(sbi->s_mb_offsets);
 	sbi->s_mb_offsets = NULL;
 	kfree(sbi->s_mb_maxs);
@@ -3582,8 +3666,12 @@ int ext4_mb_release(struct super_block *sb)
 	}
 	kfree(sbi->s_mb_avg_fragment_size);
 	kfree(sbi->s_mb_avg_fragment_size_locks);
+	kfree(sbi->s_avg_fragment_size_list_iops);
+	kfree(sbi->s_avg_fragment_size_locks_iops);
 	kfree(sbi->s_mb_largest_free_orders);
 	kfree(sbi->s_mb_largest_free_orders_locks);
+	kfree(sbi->s_largest_free_orders_list_iops);
+	kfree(sbi->s_largest_free_orders_locks_iops);
 	kfree(sbi->s_mb_offsets);
 	kfree(sbi->s_mb_maxs);
 	iput(sbi->s_buddy_cache);
-- 
1.8.3.1