Re: [PATCH v3] ext4: optimize metadata allocation for hybrid LUNs

Ritesh Harjani (IBM) <ritesh.list@xxxxxxxxx> · Wed, 20 Sep 2023 11:09:52 +0530

Bobi Jam <bobijam@xxxxxxxxxxx> writes:

> With LVM it is possible to create an LV with SSD storage at the
> beginning of the LV and HDD storage at the end of the LV, and use that
> to separate ext4 metadata allocations (that need small random IOs)
> from data allocations (that are better suited for large sequential
> IOs) depending on the type of underlying storage.  Between 0.5-1.0% of
> the filesystem capacity would need to be high-IOPS storage in order to
> hold all of the internal metadata.
>
> This would improve performance for inode and other metadata access,
> such as ls, find, e2fsck, and in general improve file access latency,
> modification, truncate, unlink, transaction commit, etc.
>
> This patch split largest free order group lists and average fragment
> size lists into other two lists for IOPS/fast storage groups, and
> cr 0 / cr 1 group scanning for metadata block allocation in following
> order:
>
> if (allocate metadata blocks)
>       if (cr == 0)
>               try to find group in largest free order IOPS group list
>       if (cr == 1)
>               try to find group in fragment size IOPS group list
>       if (above two find failed)
>               fall through normal group lists as before

Ok, so we are agreeing that if the iops groups are full, we will
fallback to non-iops group for metadata.

> if (allocate data blocks)
>       try to find group in normal group lists as before
>       if (failed to find group in normal group && mb_enable_iops_data)
>               try to find group in IOPS groups

same here but with mb_enable_iops_data.

>
> Non-metadata block allocation does not allocate from the IOPS groups
> if non-IOPS groups are not used up.

Sure. At least ENOSPC use case can be handled once mb_enable_iops_data
is enabled. (for users who might end up using large iops disk)

>
> Add for mke2fs an option to mark which blocks are in the IOPS region
> of storage at format time:
>
>   -E iops=0-1024G,4096-8192G
>
> so the ext4 mballoc code can then use the EXT4_BG_IOPS flag in the
> group descriptors to decide which groups to allocate dynamic
> filesystem metadata.
>
> Signed-off-by: Bobi Jam <bobijam@xxxxxxxxxxx
>
> --
> v2->v3: add sysfs mb_enable_iops_data to enable data block allocation
>         from IOPS groups.
> v1->v2: for metadata block allocation, search in IOPS list then normal
>         list.
> ---
>  fs/ext4/balloc.c   |   2 +-
>  fs/ext4/ext4.h     |  13 +++
>  fs/ext4/extents.c  |   5 +-
>  fs/ext4/indirect.c |   5 +-
>  fs/ext4/mballoc.c  | 229 +++++++++++++++++++++++++++++++++++++++++----
>  fs/ext4/sysfs.c    |   4 +
>  6 files changed, 234 insertions(+), 24 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index c1edde817be8..7b1b3ec2650c 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -739,7 +739,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
>  	ar.inode = inode;
>  	ar.goal = goal;
>  	ar.len = count ? *count : 1;
> -	ar.flags = flags;
> +	ar.flags = flags | EXT4_MB_HINT_METADATA;
>  
>  	ret = ext4_mb_new_blocks(handle, &ar, errp);
>  	if (count)
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 8104a21b001a..a8f21f63f5ff 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -382,6 +382,7 @@ struct flex_groups {
>  #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
>  #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
>  #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
> +#define EXT4_BG_IOPS		0x0010 /* In IOPS/fast storage */

Not related to this patch. But why not 0x0008? Is it reserved for
anything else?

>  
>  /*
>   * Macro-instructions used to manage group descriptors
> @@ -1112,6 +1113,8 @@ struct ext4_inode_info {
>  #define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
>  #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
>  
> +#define EXT2_FLAGS_HAS_IOPS		0x0080	/* has IOPS storage */
> +

same here. Are the flag values in between 0x0004 and 0x0080 are reserved?

>  /*
>   * Mount flags set via mount options or defaults
>   */
> @@ -1514,8 +1517,12 @@ struct ext4_sb_info {
>  	atomic_t s_retry_alloc_pending;
>  	struct list_head *s_mb_avg_fragment_size;
>  	rwlock_t *s_mb_avg_fragment_size_locks;
> +	struct list_head *s_avg_fragment_size_list_iops;  /* avg_frament_size for IOPS groups */
> +	rwlock_t *s_avg_fragment_size_locks_iops;
>  	struct list_head *s_mb_largest_free_orders;
>  	rwlock_t *s_mb_largest_free_orders_locks;
> +	struct list_head *s_largest_free_orders_list_iops; /* largest_free_orders for IOPS grps */
> +	rwlock_t *s_largest_free_orders_locks_iops;
>  
>  	/* tunables */
>  	unsigned long s_stripe;
> @@ -1532,6 +1539,7 @@ struct ext4_sb_info {
>  	unsigned long s_mb_last_start;
>  	unsigned int s_mb_prefetch;
>  	unsigned int s_mb_prefetch_limit;
> +	unsigned int s_mb_enable_iops_data;
>  
>  	/* stats for buddy allocator */
>  	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
> @@ -3366,6 +3374,7 @@ struct ext4_group_info {
>  #define EXT4_GROUP_INFO_IBITMAP_CORRUPT		\
>  	(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
>  #define EXT4_GROUP_INFO_BBITMAP_READ_BIT	4
> +#define EXT4_GROUP_INFO_IOPS_BIT		5
>  
>  #define EXT4_MB_GRP_NEED_INIT(grp)	\
>  	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
> @@ -3382,6 +3391,10 @@ struct ext4_group_info {
>  	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
>  #define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
>  	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GRP_TEST_IOPS(grp)	\
> +	(test_bit(EXT4_GROUP_INFO_IOPS_BIT, &((grp)->bb_state)))
> +#define EXT4_MB_GRP_SET_IOPS(grp)	\
> +	(set_bit(EXT4_GROUP_INFO_IOPS_BIT, &((grp)->bb_state)))
>  
>  #define EXT4_MAX_CONTENTION		8
>  #define EXT4_CONTENTION_THRESHOLD	2
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 35703dce23a3..6bfa784a3dad 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4272,11 +4272,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
>  	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
>  	ar.goal -= offset;
>  	ar.logical -= offset;
> -	if (S_ISREG(inode->i_mode))
> +	if (S_ISREG(inode->i_mode) &&
> +	    !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
>  		ar.flags = EXT4_MB_HINT_DATA;
>  	else
>  		/* disable in-core preallocation for non-regular files */
> -		ar.flags = 0;
> +		ar.flags = EXT4_MB_HINT_METADATA;
>  	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
>  		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
>  	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index c68bebe7ff4b..e1042c4e8ce6 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -610,8 +610,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
>  	memset(&ar, 0, sizeof(ar));
>  	ar.inode = inode;
>  	ar.logical = map->m_lblk;
> -	if (S_ISREG(inode->i_mode))
> +	if (S_ISREG(inode->i_mode) &&
> +	    !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
>  		ar.flags = EXT4_MB_HINT_DATA;
> +	else
> +		ar.flags = EXT4_MB_HINT_METADATA;
>  	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
>  		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
>  	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 20f67a260df5..a676d26eccbc 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -828,6 +828,8 @@ static void
>  mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
>  {
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	rwlock_t *afs_locks;
> +	struct list_head *afs_list;
>  	int new_order;
>  
>  	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
> @@ -838,20 +840,24 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
>  	if (new_order == grp->bb_avg_fragment_size_order)
>  		return;
>  
> +	if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS &&
> +	    EXT4_MB_GRP_TEST_IOPS(grp)) {
> +		afs_locks = sbi->s_avg_fragment_size_locks_iops;
> +		afs_list = sbi->s_avg_fragment_size_list_iops;
> +	} else {
> +		afs_locks = sbi->s_mb_avg_fragment_size_locks;
> +		afs_list = sbi->s_mb_avg_fragment_size;
> +	}
> +
>  	if (grp->bb_avg_fragment_size_order != -1) {
> -		write_lock(&sbi->s_mb_avg_fragment_size_locks[
> -					grp->bb_avg_fragment_size_order]);
> +		write_lock(&afs_locks[grp->bb_avg_fragment_size_order]);
>  		list_del(&grp->bb_avg_fragment_size_node);
> -		write_unlock(&sbi->s_mb_avg_fragment_size_locks[
> -					grp->bb_avg_fragment_size_order]);
> +		write_unlock(&afs_locks[grp->bb_avg_fragment_size_order]);
>  	}
>  	grp->bb_avg_fragment_size_order = new_order;
> -	write_lock(&sbi->s_mb_avg_fragment_size_locks[
> -					grp->bb_avg_fragment_size_order]);
> -	list_add_tail(&grp->bb_avg_fragment_size_node,
> -		&sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
> -	write_unlock(&sbi->s_mb_avg_fragment_size_locks[
> -					grp->bb_avg_fragment_size_order]);
> +	write_lock(&afs_locks[new_order]);
> +	list_add_tail(&grp->bb_avg_fragment_size_node, &afs_list[new_order]);
> +	write_unlock(&afs_locks[new_order]);
>  }
>  
>  /*
> @@ -986,6 +992,95 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
>  	return group + 1 >= ngroups ? 0 : group + 1;
>  }
>  
> +static bool ext4_mb_choose_next_iops_group_cr0(
> +			struct ext4_allocation_context *ac, ext4_group_t *group)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +	struct ext4_group_info *iter, *grp;
> +	int i;
> +
> +	if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
> +		atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
> +
> +	grp = NULL;
> +	for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
> +		if (list_empty(&sbi->s_largest_free_orders_list_iops[i]))
> +			continue;
> +		read_lock(&sbi->s_largest_free_orders_locks_iops[i]);
> +		if (list_empty(&sbi->s_largest_free_orders_list_iops[i])) {
> +			read_unlock(&sbi->s_largest_free_orders_locks_iops[i]);
> +			continue;
> +		}
> +		grp = NULL;
> +		list_for_each_entry(iter,
> +				    &sbi->s_largest_free_orders_list_iops[i],
> +				    bb_largest_free_order_node) {
> +			if (sbi->s_mb_stats)
> +				atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
> +			if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
> +				grp = iter;
> +				break;
> +			}
> +		}
> +		read_unlock(&sbi->s_largest_free_orders_locks_iops[i]);
> +		if (grp)
> +			break;
> +	}
> +
> +	if (grp) {
> +		*group = grp->bb_group;
> +		ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool ext4_mb_choose_next_iops_group_cr1(
> +			struct ext4_allocation_context *ac, ext4_group_t *group)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +	struct ext4_group_info *grp = NULL, *iter;
> +	int i;
> +
> +	if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
> +		if (sbi->s_mb_stats)
> +			atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
> +	}
> +
> +	for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
> +	     i < MB_NUM_ORDERS(ac->ac_sb); i++) {
> +		if (list_empty(&sbi->s_avg_fragment_size_list_iops[i]))
> +			continue;
> +		read_lock(&sbi->s_avg_fragment_size_locks_iops[i]);
> +		if (list_empty(&sbi->s_avg_fragment_size_list_iops[i])) {
> +			read_unlock(&sbi->s_avg_fragment_size_locks_iops[i]);
> +			continue;
> +		}
> +		list_for_each_entry(iter,
> +				    &sbi->s_avg_fragment_size_list_iops[i],
> +				    bb_avg_fragment_size_node) {
> +			if (sbi->s_mb_stats)
> +				atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
> +			if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
> +				grp = iter;
> +				break;
> +			}
> +		}
> +		read_unlock(&sbi->s_avg_fragment_size_locks_iops[i]);
> +		if (grp)
> +			break;
> +	}
> +
> +	if (grp) {
> +		*group = grp->bb_group;
> +		ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
>  /*
>   * ext4_mb_choose_next_group: choose next group for allocation.
>   *
> @@ -1002,6 +1097,10 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
>  static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
>  		int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
>  {
> +	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +	bool alloc_metadata = ac->ac_flags & EXT4_MB_HINT_METADATA;
> +	bool ret = false;
> +
>  	*new_cr = ac->ac_criteria;
>  
>  	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
> @@ -1009,11 +1108,37 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
>  		return;
>  	}
>  
> +	if (alloc_metadata && sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS) {
> +		if (*new_cr == 0)
> +			ret = ext4_mb_choose_next_iops_group_cr0(ac, group);
> +		if (!ret && *new_cr < 2)
> +			ret = ext4_mb_choose_next_iops_group_cr1(ac, group);

This is a bit confusing here. Say if *new_cr = 0 fails, then we return
ret = false and fallback to choosing xx_iops_group_cr1(). And say if we
were able to find a group which satisfies this allocation request we
return. But the caller never knows that we allocated using cr1 and not
cr0. Because we never updated *new_cr inside xx_iops_group_crX()

> +		if (ret)
> +			return;
> +		/*
> +		 * Cannot get metadata group from IOPS storage, fall through
> +		 * to slow storage.
> +		 */
> +		cond_resched();

Not sure after you fix above case, do we still require cond_resched()
here. Note we already have one in the for loop which iterates over all
the groups for a given ac_criteria.

> +	}
> +
>  	if (*new_cr == 0) {
>  		ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
>  	} else if (*new_cr == 1) {
>  		ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
>  	} else {
> +		/*
> +		 * Cannot get data group from slow storage, try IOPS storage
> +		 */
> +		if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS &&
> +		    !alloc_metadata && sbi->s_mb_enable_iops_data &&
> +		    *new_cr == 3) {
> +			if (ac->ac_2order)
> +				ret = ext4_mb_choose_next_iops_group_cr0(ac,
> +									 group);
> +			if (!ret)
> +				ext4_mb_choose_next_iops_group_cr1(ac, group);
> +		}

We might never come here in this else case because
should_optimize_scan() which we check in the beginning of this function
will return 0 and we will chose a next linear group for CR >= 2.

>  		/*
>  		 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
>  		 * bb_free. But until that happens, we should never come here.
> @@ -1030,6 +1155,8 @@ static void
>  mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
>  {
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	rwlock_t *lfo_locks;
> +	struct list_head *lfo_list;
>  	int i;
>  
>  	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
> @@ -1042,21 +1169,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
>  		return;
>  	}
>  
> +	if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS &&
> +	    EXT4_MB_GRP_TEST_IOPS(grp)) {
> +		lfo_locks = sbi->s_largest_free_orders_locks_iops;
> +		lfo_list = sbi->s_largest_free_orders_list_iops;
> +	} else {
> +		lfo_locks = sbi->s_mb_largest_free_orders_locks;
> +		lfo_list = sbi->s_mb_largest_free_orders;
> +	}
> +
>  	if (grp->bb_largest_free_order >= 0) {
> -		write_lock(&sbi->s_mb_largest_free_orders_locks[
> -					      grp->bb_largest_free_order]);
> +		write_lock(&lfo_locks[grp->bb_largest_free_order]);
>  		list_del_init(&grp->bb_largest_free_order_node);
> -		write_unlock(&sbi->s_mb_largest_free_orders_locks[
> -					      grp->bb_largest_free_order]);
> +		write_unlock(&lfo_locks[grp->bb_largest_free_order]);
>  	}
>  	grp->bb_largest_free_order = i;
>  	if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
> -		write_lock(&sbi->s_mb_largest_free_orders_locks[
> -					      grp->bb_largest_free_order]);
> -		list_add_tail(&grp->bb_largest_free_order_node,
> -		      &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
> -		write_unlock(&sbi->s_mb_largest_free_orders_locks[
> -					      grp->bb_largest_free_order]);
> +		write_lock(&lfo_locks[i]);
> +		list_add_tail(&grp->bb_largest_free_order_node, &lfo_list[i]);
> +		write_unlock(&lfo_locks[i]);
>  	}
>  }
>  
> @@ -2498,6 +2629,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
>  		goto out;
>  	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
>  		goto out;
> +	if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS &&
> +	    (ac->ac_flags & EXT4_MB_HINT_DATA) && EXT4_MB_GRP_TEST_IOPS(grp) &&
> +	    !sbi->s_mb_enable_iops_data)
> +		goto out;

since we already have a grp information here. Then checking for s_flags
and is redundant here right?

>  	if (should_lock) {
>  		__acquire(ext4_group_lock_ptr(sb, group));
>  		ext4_unlock_group(sb, group);
> @@ -3150,6 +3285,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>  	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>  	init_rwsem(&meta_group_info[i]->alloc_sem);
>  	meta_group_info[i]->bb_free_root = RB_ROOT;
> +	if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS &&
> +	    desc->bg_flags & EXT4_BG_IOPS)
> +		EXT4_MB_GRP_SET_IOPS(meta_group_info[i]);
>  	INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
>  	INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
>  	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
> @@ -3423,6 +3561,26 @@ int ext4_mb_init(struct super_block *sb)
>  		INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
>  		rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
>  	}
> +	if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS) {
> +		sbi->s_avg_fragment_size_list_iops =
> +			kmalloc_array(MB_NUM_ORDERS(sb),
> +				      sizeof(struct list_head), GFP_KERNEL);
> +		if (!sbi->s_avg_fragment_size_list_iops) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		sbi->s_avg_fragment_size_locks_iops =
> +			kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
> +				      GFP_KERNEL);
> +		if (!sbi->s_avg_fragment_size_locks_iops) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
> +			INIT_LIST_HEAD(&sbi->s_avg_fragment_size_list_iops[i]);
> +			rwlock_init(&sbi->s_avg_fragment_size_locks_iops[i]);
> +		}
> +	}
>  	sbi->s_mb_largest_free_orders =
>  		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
>  			GFP_KERNEL);
> @@ -3441,6 +3599,27 @@ int ext4_mb_init(struct super_block *sb)
>  		INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
>  		rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
>  	}
> +	if (sbi->s_es->s_flags & EXT2_FLAGS_HAS_IOPS) {
> +		sbi->s_largest_free_orders_list_iops =
> +			kmalloc_array(MB_NUM_ORDERS(sb),
> +				      sizeof(struct list_head), GFP_KERNEL);
> +		if (!sbi->s_largest_free_orders_list_iops) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		sbi->s_largest_free_orders_locks_iops =
> +			kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
> +				      GFP_KERNEL);
> +		if (!sbi->s_largest_free_orders_locks_iops) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
> +			INIT_LIST_HEAD(
> +				&sbi->s_largest_free_orders_list_iops[i]);
> +			rwlock_init(&sbi->s_largest_free_orders_locks_iops[i]);
> +		}
> +	}
>  
>  	spin_lock_init(&sbi->s_md_lock);
>  	sbi->s_mb_free_pending = 0;
> @@ -3481,6 +3660,8 @@ int ext4_mb_init(struct super_block *sb)
>  			sbi->s_mb_group_prealloc, sbi->s_stripe);
>  	}
>  
> +	sbi->s_mb_enable_iops_data = 0;
> +
>  	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
>  	if (sbi->s_locality_groups == NULL) {
>  		ret = -ENOMEM;
> @@ -3512,8 +3693,12 @@ int ext4_mb_init(struct super_block *sb)
>  out:
>  	kfree(sbi->s_mb_avg_fragment_size);
>  	kfree(sbi->s_mb_avg_fragment_size_locks);
> +	kfree(sbi->s_avg_fragment_size_list_iops);
> +	kfree(sbi->s_avg_fragment_size_locks_iops);
>  	kfree(sbi->s_mb_largest_free_orders);
>  	kfree(sbi->s_mb_largest_free_orders_locks);
> +	kfree(sbi->s_largest_free_orders_list_iops);
> +	kfree(sbi->s_largest_free_orders_locks_iops);
>  	kfree(sbi->s_mb_offsets);
>  	sbi->s_mb_offsets = NULL;
>  	kfree(sbi->s_mb_maxs);
> @@ -3582,8 +3767,12 @@ int ext4_mb_release(struct super_block *sb)
>  	}
>  	kfree(sbi->s_mb_avg_fragment_size);
>  	kfree(sbi->s_mb_avg_fragment_size_locks);
> +	kfree(sbi->s_avg_fragment_size_list_iops);
> +	kfree(sbi->s_avg_fragment_size_locks_iops);
>  	kfree(sbi->s_mb_largest_free_orders);
>  	kfree(sbi->s_mb_largest_free_orders_locks);
> +	kfree(sbi->s_largest_free_orders_list_iops);
> +	kfree(sbi->s_largest_free_orders_locks_iops);
>  	kfree(sbi->s_mb_offsets);
>  	kfree(sbi->s_mb_maxs);
>  	iput(sbi->s_buddy_cache);
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 3042bc605bbf..86ab6c4ed3b8 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -245,6 +245,7 @@ EXT4_ATTR(journal_task, 0444, journal_task);
>  EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
>  EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
>  EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
> +EXT4_RW_ATTR_SBI_UI(mb_enable_iops_data, s_mb_enable_iops_data);
>  
>  static unsigned int old_bump_val = 128;
>  EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -295,6 +296,7 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(mb_prefetch),
>  	ATTR_LIST(mb_prefetch_limit),
>  	ATTR_LIST(last_trim_minblks),
> +	ATTR_LIST(mb_enable_iops_data),
>  	NULL,
>  };
>  ATTRIBUTE_GROUPS(ext4);
> @@ -318,6 +320,7 @@ EXT4_ATTR_FEATURE(fast_commit);
>  #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
>  EXT4_ATTR_FEATURE(encrypted_casefold);
>  #endif
> +EXT4_ATTR_FEATURE(iops);
>  
>  static struct attribute *ext4_feat_attrs[] = {
>  	ATTR_LIST(lazy_itable_init),
> @@ -338,6 +341,7 @@ static struct attribute *ext4_feat_attrs[] = {
>  #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
>  	ATTR_LIST(encrypted_casefold),
>  #endif
> +	ATTR_LIST(iops),
>  	NULL,
>  };
>  ATTRIBUTE_GROUPS(ext4_feat);
> -- 
> 2.42.0