Re: [E2FSPROGS, RFC] mke2fs: New bitmap and inode table allocation for FLEX_BG

"Jose R. Santos" <jrs@xxxxxxxxxx> · Tue, 22 Apr 2008 09:18:47 -0500

On Tue, 22 Apr 2008 08:46:19 -0400
"Theodore Ts'o" <tytso@xxxxxxx> wrote:

> Change the way we allocate bitmaps and inode tables if the FLEX_BG
> feature is used at mke2fs time.  It calculates a new offset for
> bitmaps and inode tables based on the number of groups that the user
> wishes to pack together using the new "-G" option.  Creating a
> filesystem with 64 block groups in a flex group can be done by:
> 
> mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX
> 
> Signed-off-by: Jose R. Santos <jrs@xxxxxxxxxx>
> Signed-off-by: Valerie Clement <valerie.clement@xxxxxxxx>
> Signed-off-by: Theodore Ts'o <tytso@xxxxxxx>
> ---
>  lib/ext2fs/alloc_tables.c |  118 +++++++++++++++++++++++++++++++++++++++++++-
>  lib/ext2fs/initialize.c   |    7 +++
>  misc/mke2fs.8.in          |   16 ++++++
>  misc/mke2fs.c             |   35 ++++++++++++-
>  misc/mke2fs.conf.5.in     |    7 +++
>  5 files changed, 177 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
> index 9b4f0e5..d87585b 100644
> --- a/lib/ext2fs/alloc_tables.c
> +++ b/lib/ext2fs/alloc_tables.c
> @@ -27,18 +27,80 @@
>  #include "ext2_fs.h"
>  #include "ext2fs.h"
> 
> +/*
> + * This routine searches for free blocks that can allocate a full
> + * group of bitmaps or inode tables for a flexbg group.  Returns the
> + * block number with a correct offset where the bitmaps and inode
> + * tables can be allocated contiguously and in order.
> + */
> +static blk_t flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
> +			   ext2fs_block_bitmap bmap, int offset, int size)
> +{
> +	int		flexbg, flexbg_size, elem_size;
> +	blk_t		last_blk, first_free = 0;
> +	dgrp_t	       	last_grp;
> +
> +	flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> +	flexbg = group / flexbg_size;
> +
> +	if (size > fs->super->s_blocks_per_group / 8)
> +		size = fs->super->s_blocks_per_group / 8;
> +
> +	/*
> +	 * Dont do a long search if the previous block
> +	 * search is still valid.
> +	 */
> +	if (start_blk && group % flexbg_size) {
> +		if (size > flexbg_size)
> +			elem_size = fs->inode_blocks_per_group;
> +		else
> +			elem_size = 1;
> +		if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
> +						   size))
> +			return start_blk + elem_size;
> +	}
> +
> +	start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
> +	last_grp = group | (flexbg_size - 1);
> +	if (last_grp > fs->group_desc_count)
> +		last_grp = fs->group_desc_count;
> +	last_blk = ext2fs_group_last_block(fs, last_grp);
> +
> +	/* Find the first available block */
> +	if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
> +				   &first_free))
> +		return first_free;
> +
> +	if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
> +				   bmap, &first_free))
> +		return first_free;
> +
> +	return first_free;
> +}
> +
>  errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  				      ext2fs_block_bitmap bmap)
>  {
>  	errcode_t	retval;
>  	blk_t		group_blk, start_blk, last_blk, new_blk, blk;
> -	int		j;
> +	dgrp_t		last_grp;
> +	int		j, rem_grps, flexbg_size = 0;
> 
>  	group_blk = ext2fs_group_first_block(fs, group);
>  	last_blk = ext2fs_group_last_block(fs, group);
> 
>  	if (!bmap)
>  		bmap = fs->block_map;
> +
> +	if (EXT2_HAS_INCOMPAT_FEATURE(fs->super,
> +				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> +	    fs->super->s_log_groups_per_flex) {
> +		flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> +		last_grp = group | (flexbg_size - 1);
> +		rem_grps = last_grp - group;
> +		if (last_grp > fs->group_desc_count)
> +			last_grp = fs->group_desc_count;
> +	}
>  	
>  	/*
>  	 * Allocate the block and inode bitmaps, if necessary
> @@ -56,6 +118,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  	} else
>  		start_blk = group_blk;
> 
> +	if (flexbg_size) {
> +		int prev_block = 0;
> +		if (group && fs->group_desc[group-1].bg_block_bitmap)
> +			prev_block = fs->group_desc[group-1].bg_block_bitmap;
> +		start_blk = flexbg_offset(fs, group, prev_block, bmap,
> +						 0, rem_grps);
> +		last_blk = ext2fs_group_last_block(fs, last_grp);
> +	}
> +
>  	if (!fs->group_desc[group].bg_block_bitmap) {
>  		retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
>  						1, bmap, &new_blk);
> @@ -66,6 +137,22 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  			return retval;
>  		ext2fs_mark_block_bitmap(bmap, new_blk);
>  		fs->group_desc[group].bg_block_bitmap = new_blk;
> +		if (flexbg_size) {
> +			dgrp_t gr = ext2fs_group_of_blk(fs, new_blk);
> +			fs->group_desc[gr].bg_free_blocks_count--;
> +			fs->super->s_free_blocks_count--;
> +			fs->group_desc[gr].bg_flags &= ~EXT2_BG_BLOCK_UNINIT;
> +			ext2fs_group_desc_csum_set(fs, gr);
> +		}
> +	}
> +
> +	if (flexbg_size) {
> +		int prev_block = 0;
> +		if (group && fs->group_desc[group-1].bg_inode_bitmap)
> +			prev_block = fs->group_desc[group-1].bg_inode_bitmap;
> +		start_blk = flexbg_offset(fs, group, prev_block, bmap,
> +						 flexbg_size, rem_grps);
> +		last_blk = ext2fs_group_last_block(fs, last_grp);
>  	}
> 
>  	if (!fs->group_desc[group].bg_inode_bitmap) {
> @@ -78,11 +165,29 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  			return retval;
>  		ext2fs_mark_block_bitmap(bmap, new_blk);
>  		fs->group_desc[group].bg_inode_bitmap = new_blk;
> +		if (flexbg_size) {
> +			dgrp_t gr = ext2fs_group_of_blk(fs, new_blk);
> +			fs->group_desc[gr].bg_free_blocks_count--;
> +			fs->super->s_free_blocks_count--;
> +			fs->group_desc[gr].bg_flags &= ~EXT2_BG_BLOCK_UNINIT;
> +			ext2fs_group_desc_csum_set(fs, gr);
> +		}
>  	}
> 
>  	/*
>  	 * Allocate the inode table
>  	 */
> +	if (flexbg_size) {
> +		int prev_block = 0;
> +		if (group && fs->group_desc[group-1].bg_inode_table)
> +			prev_block = fs->group_desc[group-1].bg_inode_table;
> +		group_blk = flexbg_offset(fs, group, prev_block, bmap,
> +						 flexbg_size * 2,
> +						 fs->inode_blocks_per_group *
> +						 rem_grps);
> +		last_blk = ext2fs_group_last_block(fs, last_grp);
> +	}
> +
>  	if (!fs->group_desc[group].bg_inode_table) {
>  		retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
>  						fs->inode_blocks_per_group,
> @@ -91,12 +196,19 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
>  			return retval;
>  		for (j=0, blk = new_blk;
>  		     j < fs->inode_blocks_per_group;
> -		     j++, blk++)
> +		     j++, blk++) {
>  			ext2fs_mark_block_bitmap(bmap, blk);
> +			if (flexbg_size) {
> +				dgrp_t gr = ext2fs_group_of_blk(fs, blk);
> +				fs->group_desc[gr].bg_free_blocks_count--;
> +				fs->super->s_free_blocks_count--;
> +				fs->group_desc[gr].bg_flags &= ~EXT2_BG_BLOCK_UNINIT;
> +				ext2fs_group_desc_csum_set(fs, gr);
> +			}
> +		}
>  		fs->group_desc[group].bg_inode_table = new_blk;
>  	}
>  	ext2fs_group_desc_csum_set(fs, group);
> -
>  	return 0;
>  }
> 
> diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
> index 09e1008..396dd59 100644
> --- a/lib/ext2fs/initialize.c
> +++ b/lib/ext2fs/initialize.c
> @@ -159,6 +159,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
>  	set_field(s_first_meta_bg, 0);
>  	set_field(s_raid_stride, 0);		/* default stride size: 0 */
>  	set_field(s_raid_stripe_width, 0);	/* default stripe width: 0 */
> +	set_field(s_log_groups_per_flex, 0);
>  	set_field(s_flags, 0);
>  	if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
>  		retval = EXT2_ET_UNSUPP_FEATURE;
> @@ -374,6 +375,10 @@ ipg_retry:
>  	 * Note that although the block bitmap, inode bitmap, and
>  	 * inode table have not been allocated (and in fact won't be
>  	 * by this routine), they are accounted for nevertheless.
> +	 *
> +	 * If FLEX_BG meta-data grouping is used, only account for the
> +	 * superblock and group descriptors (the inode tables and
> +	 * bitmaps will be accounted for when allocated).
>  	 */
>  	super->s_free_blocks_count = 0;
>  	csum_flag = EXT2_HAS_RO_COMPAT_FEATURE(fs->super,
> @@ -390,6 +395,8 @@ ipg_retry:
>  			fs->group_desc[i].bg_flags |= EXT2_BG_INODE_UNINIT;
>  		}
>  		numblocks = ext2fs_reserve_super_and_bgd(fs, i, fs->block_map);
> +		if (fs->super->s_log_groups_per_flex)
> +			numblocks += 2 + fs->inode_blocks_per_group;
> 
>  		super->s_free_blocks_count += numblocks;
>  		fs->group_desc[i].bg_free_blocks_count = numblocks;
> diff --git a/misc/mke2fs.8.in b/misc/mke2fs.8.in
> index 1e9a203..aa068b3 100644
> --- a/misc/mke2fs.8.in
> +++ b/misc/mke2fs.8.in
> @@ -26,6 +26,10 @@ mke2fs \- create an ext2/ext3 filesystem
>  .I blocks-per-group
>  ]
>  [
> +.B \-G
> +.I number-of-groups
> +]
> +[
>  .B \-i
>  .I bytes-per-inode
>  ]
> @@ -245,6 +249,13 @@ option rather than manipulating the number of blocks per group.)
>  This option is generally used by developers who
>  are developing test cases.  
>  .TP
> +.BI \-G " number-of-groups"
> +Specify the number of block groups that will be packed together to
> +create one large virtual block group on an ext4 filesystem.  This
> +improves meta-data locality and performance on meta-data heavy
> +workloads.  The number of groups must be a power of 2 and may only be
> +specified if the flex_bg filesystem feature is enabled.
> +.TP
>  .BI \-i " bytes-per-inode"
>  Specify the bytes/inode ratio. 
>  .B mke2fs
> @@ -445,6 +456,11 @@ Use hashed b-trees to speed up lookups in large directories.
>  .B filetype
>  Store file type information in directory entries.
>  .TP
> +.B flex_bg
> +Allow bitmaps and inode tables for a block group to be placed anywhere
> +on the storage media (use with -G option to group meta-data in order
> +to create a large virtual block group).
> +.TP
>  .B has_journal
>  Create an ext3 journal (as if using the
>  .B \-j
> diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> index 61f45aa..e37e510 100644
> --- a/misc/mke2fs.c
> +++ b/misc/mke2fs.c
> @@ -98,8 +98,9 @@ static void usage(void)
>  	fprintf(stderr, _("Usage: %s [-c|-l filename] [-b block-size] "
>  	"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
>  	"[-J journal-options]\n"
> -	"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
> -	"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
> +	"\t[-G meta group size] [-N number-of-inodes]\n"
> +	"\t[-m reserved-blocks-percentage] [-o creator-os]\n"
> +	"\t[-g blocks-per-group] [-L volume-label] "
>  	"[-M last-mounted-directory]\n\t[-O feature[,...]] "
>  	"[-r fs-revision] [-E extended-option[,...]]\n"
>  	"\t[-T fs-type] [-jnqvFSV] device [blocks-count]\n"),
> @@ -1096,6 +1097,7 @@ static void PRS(int argc, char *argv[])
>  	int		blocksize = 0;
>  	int		inode_ratio = 0;
>  	int		inode_size = 0;
> +	unsigned long	flex_bg_size = 0;
>  	double		reserved_ratio = 5.0;
>  	int		sector_size = 0;
>  	int		show_version_only = 0;
> @@ -1180,7 +1182,7 @@ static void PRS(int argc, char *argv[])
>  	}
> 
>  	while ((c = getopt (argc, argv,
> -		    "b:cf:g:i:jl:m:no:qr:s:t:vE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
> +		    "b:cf:g:G:i:jl:m:no:qr:s:t:vE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
>  		switch (c) {
>  		case 'b':
>  			blocksize = strtol(optarg, &tmp, 0);
> @@ -1230,6 +1232,20 @@ static void PRS(int argc, char *argv[])
>  				exit(1);
>  			}
>  			break;
> +		case 'G':
> +			flex_bg_size = strtoul(optarg, &tmp, 0);
> +			if (*tmp) {
> +				com_err(program_name, 0,
> +					_("Illegal number for flex_bg size"));
> +				exit(1);
> +			}
> +			if (flex_bg_size < 2 ||
> +			    (flex_bg_size & (flex_bg_size-1)) != 0) {
> +				com_err(program_name, 0,
> +					_("flex_bg size must be a power of 2"));
> +				exit(1);
> +			}
> +			break;
>  		case 'i':
>  			inode_ratio = strtoul(optarg, &tmp, 0);
>  			if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
> @@ -1638,6 +1654,19 @@ static void PRS(int argc, char *argv[])
> 
>  	if (inode_size == 0)
>  		inode_size = get_int_from_profile(fs_types, "inode_size", 0);
> +	if (!flex_bg_size && (fs_param.s_feature_incompat &
> +			      EXT4_FEATURE_INCOMPAT_FLEX_BG))
> +		get_int_from_profile(fs_types, "flex_bg_size", 8);

A default of 256 block groups to pack seems a bit high based on some of
the performance testing that I've done.  At some point, having the inodes
too far away from the data blocks begins to affect performance
(especially on read operations).  The optimum number of groups depends
a lot on the platter density of the hard drive, so I expect that we can
increase the default grouping size as time goes by.  Using 128 groups
is already showing performance degradation on read operations on some
of my smaller disks (147GB).  For now, I would change this to 6 (64
groups), as this is a good balance for both big and small disks.

> +	if (flex_bg_size) {
> +		if (!(fs_param.s_feature_incompat &
> +		      EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
> +			com_err(program_name, 0,
> +				_("Flex_bg feature not enabled, so "
> +				  "flex_bg size may not be specified"));
> +			exit(1);
> +		}
> +		fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
> +	}
> 
>  	if (inode_size && fs_param.s_rev_level >= EXT2_DYNAMIC_REV) {
>  		if (inode_size < EXT2_GOOD_OLD_INODE_SIZE ||
> diff --git a/misc/mke2fs.conf.5.in b/misc/mke2fs.conf.5.in
> index 6734bf3..5dd92d8 100644
> --- a/misc/mke2fs.conf.5.in
> +++ b/misc/mke2fs.conf.5.in
> @@ -301,6 +301,13 @@ specify one on the command line.
>  .I inode_size
>  This relation specifies the default inode size if the user does not
>  specify one on the command line.
> +.TP
> +.I flex_bg_size
> +This relation specifies the number of block groups that will be packed
> +together to create one large virtual block group on an ext4 filesystem.
> +This improves meta-data locality and performance on meta-data heavy
> +workloads.  The number of groups must be a power of 2 and may only be
> +specified if the flex_bg filesystem feature is enabled.
>  .SH FILES
>  .TP
>  .I /etc/mke2fs.conf

-JRS
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html