[PATCH 2/2] mke2fs: add "-E iops" to set IOPS storage group

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



With LVM it is possible to create an LV with SSD storage at the
beginning of the LV and HDD storage at the end of the LV, and use that
to separate ext4 metadata allocations (that need small random IOs)
from data allocations (that are better suited for large sequential
IOs) depending on the type of underlying storage.  Between 0.5-1.0% of
the filesystem capacity would need to be high-IOPS storage in order to
hold all of the internal metadata.

This would improve performance for inode and other metadata access
(e.g. ls, find, e2fsck) and, more generally, reduce the latency of file
access, modification, truncate, unlink, transaction commit, etc.

For mke2fs, using the sparse_super2 and packed_meta_blocks options
places all of the static metadata (group descriptors, block/inode
bitmaps, inode tables, journal) at the start of the device in the
(IOPS) flash region.

Add an option to mark which blocks are in the IOPS region of storage
at format time:

  -E iops=0-1024G:4096-8192G

so the ext4 mballoc code can then use the EXT4_BG_IOPS flag in the
group descriptors to decide which groups to allocate dynamic
filesystem metadata from.

Change-Id: I13cc2820c71737848eab8a2d6e246748258a64df
Signed-off-by: Bobi Jam <bobijam@xxxxxxxxxxx>
---
 debugfs/debugfs.c    |   2 +
 lib/e2p/ls.c         |   4 ++
 lib/ext2fs/ext2_fs.h |   2 +
 misc/dumpe2fs.c      |   2 +
 misc/mke2fs.8.in     |   8 +++
 misc/mke2fs.c        | 150 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 168 insertions(+)

diff --git a/debugfs/debugfs.c b/debugfs/debugfs.c
index 9b6321dc..81c51de1 100644
--- a/debugfs/debugfs.c
+++ b/debugfs/debugfs.c
@@ -515,6 +515,8 @@ void do_show_super_stats(int argc, char *argv[],
 			      &first, out);
 		print_bg_opts(current_fs, i, EXT2_BG_BLOCK_UNINIT, "Block not init",
 			      &first, out);
+		print_bg_opts(current_fs, i, EXT2_BG_IOPS, "IOPS",
+			      &first, out);
 		if (gdt_csum) {
 			fprintf(out, "%sChecksum 0x%04x",
 				first ? "           [":", ", ext2fs_bg_checksum(current_fs, i));
diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c
index 0b74aea2..c13927c6 100644
--- a/lib/e2p/ls.c
+++ b/lib/e2p/ls.c
@@ -162,6 +162,10 @@ static void print_super_flags(struct ext2_super_block * s, FILE *f)
 		fputs("test_filesystem ", f);
 		flags_found++;
 	}
+	if (s->s_flags & EXT2_FLAGS_HAS_IOPS) {
+		fputs("iops ", f);
+		flags_found++;
+	}
 	if (flags_found)
 		fputs("\n", f);
 	else
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index fb69e964..ea26d356 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -223,6 +223,7 @@ struct ext4_group_desc
 #define EXT2_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not initialized */
 #define EXT2_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not initialized */
 #define EXT2_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+#define EXT2_BG_IOPS		0x0010 /* In IOPS/fast storage */
 
 /*
  * Data structures used by the directory indexing feature
@@ -572,6 +573,7 @@ struct ext2_inode *EXT2_INODE(struct ext2_inode_large *large_inode)
 #define EXT2_FLAGS_IS_SNAPSHOT		0x0010	/* This is a snapshot image */
 #define EXT2_FLAGS_FIX_SNAPSHOT		0x0020	/* Snapshot inodes corrupted */
 #define EXT2_FLAGS_FIX_EXCLUDE		0x0040	/* Exclude bitmaps corrupted */
+#define EXT2_FLAGS_HAS_IOPS		0x0080	/* has IOPS storage */
 
 /*
  * Mount flags
diff --git a/misc/dumpe2fs.c b/misc/dumpe2fs.c
index 7c080ed9..c6e43d3a 100644
--- a/misc/dumpe2fs.c
+++ b/misc/dumpe2fs.c
@@ -131,6 +131,8 @@ static void print_bg_opts(ext2_filsys fs, dgrp_t i)
  		     &first);
 	print_bg_opt(bg_flags, EXT2_BG_INODE_ZEROED, "ITABLE_ZEROED",
  		     &first);
+	print_bg_opt(bg_flags, EXT2_BG_IOPS, "IOPS",
+		     &first);
 	if (!first)
 		fputc(']', stdout);
 	fputc('\n', stdout);
diff --git a/misc/mke2fs.8.in b/misc/mke2fs.8.in
index 30f97bb5..2d1bc829 100644
--- a/misc/mke2fs.8.in
+++ b/misc/mke2fs.8.in
@@ -435,6 +435,14 @@ effect only if the
 feature is set.   The default quota types to be initialized if this
 option is not specified is both user and group quotas.  If the project
 feature is enabled that project quotas will be initialized as well.
+.TP
+.BI iops= <size_range>[:<size_range>][...]
+Specify the size ranges of the device that are on high-IOPS storage, e.g.
+.B iops=0-1024G:4096-8192G
+Block groups that start inside one of these ranges are flagged as IOPS
+groups, so the file system knows which block groups are on relatively faster
+storage and the kernel block allocator can place metadata allocations on the
+high-IOPS part of a hybrid flash/HDD device for better performance.
 .RE
 .TP
 .B \-F
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index c69efe39..ccfcf3d1 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -103,6 +103,10 @@ static __u64	offset;
 static blk64_t journal_location = ~0LL;
 static int	proceed_delay = -1;
 static blk64_t	dev_size;
+blk64_t		iops_array[64];	/* initial storage for start/end pairs */
+unsigned int	iops_size = sizeof(iops_array) / sizeof(iops_array[0]);
+unsigned int	iops_count = 0;	/* entries in use, two per range */
+blk64_t		*iops_range = iops_array;	/* array currently in use */
 
 static struct ext2_super_block fs_param;
 static __u32 zero_buf[4];
@@ -742,6 +746,54 @@ static int set_os(struct ext2_super_block *sb, char *os)
 	return 1;
 }
 
+/*
+ * Parse one "<start>-<end>" range (e.g. "0-1024G") in place and append it to
+ * iops_range[] as a start/end pair.  p_start is the first character, p_hyphen
+ * the '-' and p_end the end of the range text.  Returns 0, or -EINVAL if
+ * start >= end, -E2BIG/-ENOMEM if the array cannot be grown.
+ */
+static int parse_range(char *p_start, char *p_end, char *p_hyphen)
+{
+	unsigned int cap = sizeof(iops_array) / sizeof(iops_array[0]);
+	blk64_t start, end, *new_array;
+
+	/* iops_size only describes the array once it has moved to the heap */
+	if (iops_range != iops_array)
+		cap = iops_size;
+
+	end = parse_num_blocks(p_hyphen + 1, -1);
+	/* a G/M/K unit given only on the end value also applies to start */
+	if (p_hyphen > p_start && !isdigit((unsigned char)*(p_end - 1)) &&
+	    isdigit((unsigned char)*(p_hyphen - 1)))
+		*p_hyphen++ = *(p_end - 1);
+	*p_hyphen = 0;
+
+	start = parse_num_blocks(p_start, -1);
+	if (start >= end)
+		return -EINVAL;
+
+	if (iops_count + 2 > cap) {
+		if (cap > ~0U / (2 * sizeof(blk64_t)))
+			return -E2BIG;
+		if (iops_range == iops_array) {
+			new_array = malloc(2 * cap * sizeof(blk64_t));
+			if (new_array)	/* keep the ranges parsed so far */
+				memcpy(new_array, iops_array,
+				       iops_count * sizeof(blk64_t));
+		} else {
+			new_array = realloc(iops_range,
+					    2 * cap * sizeof(blk64_t));
+		}
+		if (!new_array)
+			return -ENOMEM;
+		iops_range = new_array;
+		iops_size = 2 * cap;
+	}
+	iops_range[iops_count++] = start;
+	iops_range[iops_count++] = end;
+	return 0;
+}
+
 #define PATH_SET "PATH=/sbin"
 
 static void parse_extended_opts(struct ext2_super_block *param,
@@ -1059,6 +1111,62 @@ static void parse_extended_opts(struct ext2_super_block *param,
 				r_usage++;
 				continue;
 			}
+		} else if (!strcmp(token, "iops")) {
+			char *p_colon, *p_hyphen;
+			blk64_t start, end;
+
+			/* example: iops=0-1024G:4096-8192G */
+
+			if (!arg) {
+				r_usage++;
+				badopt = token;
+				continue;
+			}
+			p_colon = strchr(arg, ':');
+			while (p_colon != NULL) {
+				*p_colon = 0;
+
+				p_hyphen = strchr(arg, '-');
+				if (p_hyphen == NULL) {
+					fprintf(stderr,
+						_("error: parse iops %s\n"),
+						arg);
+					r_usage++;
+					badopt = token;
+					break;
+				}
+
+				ret = parse_range(arg, p_colon, p_hyphen);
+				if (ret < 0) {
+					fprintf(stderr,
+						_("error: parse iops %s:%d\n"),
+						arg, ret);
+					r_usage++;
+					badopt = token;
+					break;
+				}
+
+				arg = p_colon + 1;
+				p_colon = strchr(arg, ':');
+			}
+			p_hyphen = strchr(arg, '-');
+			if (p_hyphen == NULL) {
+				fprintf(stderr,
+					_("error: parse iops %s\n"), arg);
+				r_usage++;
+				badopt = token;
+				continue;
+			}
+
+			ret = parse_range(arg, arg + strlen(arg), p_hyphen);
+			if (ret	< 0) {
+				fprintf(stderr,
+					_("error: parse iops %s:%d\n"),
+					arg, ret);
+				r_usage++;
+				badopt = token;
+				continue;
+			}
 		} else {
 			r_usage++;
 			badopt = token;
@@ -1085,10 +1193,13 @@ static void parse_extended_opts(struct ext2_super_block *param,
 			"\tnodiscard\n"
 			"\tencoding=<encoding>\n"
 			"\tencoding_flags=<flags>\n"
+			"\tiops=<iops storage size range>\n"
 			"\tquotatype=<quota type(s) to be enabled>\n"
 			"\tassume_storage_prezeroed=<0 to disable, 1 to enable>\n\n"),
 			badopt ? badopt : "");
 		free(buf);
+		if (iops_range != iops_array)
+			free(iops_range);
 		exit(1);
 	}
 	if (param->s_raid_stride &&
@@ -2973,6 +3084,35 @@ try_user:
 	return 0;
 }
 
+/* Return 1 if the start offset of @group lies in any [start, end) pair */
+static int ext2fs_group_in_range(ext2_filsys fs, dgrp_t group,
+				 blk64_t *array, int count)
+{
+	/* widen before multiplying so the offset is computed in 64 bits */
+	blk64_t grp_off = (blk64_t)group * EXT2_BLOCKS_PER_GROUP(fs->super) *
+			  fs->blocksize;
+	int i;
+	for (i = 0; i + 1 < count; i += 2)
+		if (grp_off >= array[i] && grp_off < array[i + 1])
+			return 1;
+	return 0;
+}
+
+/* Set EXT2_BG_IOPS on each group accepted by ext2fs_group_in_range() */
+static void ext2fs_set_iops_group(ext2_filsys fs, blk64_t *array, int count)
+{
+	dgrp_t grp;
+
+	if (array == NULL || count == 0)
+		return;
+	for (grp = 0; grp < fs->group_desc_count; grp++) {
+		if (!ext2fs_group_in_range(fs, grp, array, count))
+			continue;
+		ext2fs_bg_flags_set(fs, grp, EXT2_BG_IOPS);
+		ext2fs_group_desc_csum_set(fs, grp);
+	}
+}
+
 int main (int argc, char *argv[])
 {
 	errcode_t	retval = 0;
@@ -3054,6 +3194,16 @@ int main (int argc, char *argv[])
 			_("while setting up superblock"));
 		exit(1);
 	}
+
+	if (iops_range && iops_count) {
+		ext2fs_set_iops_group(fs, iops_range, iops_count);
+		fs->super->s_flags |= EXT2_FLAGS_HAS_IOPS;
+		ext2fs_mark_super_dirty(fs);
+
+		if (iops_range != iops_array)
+			free(iops_range);
+	}
+
 	fs->progress_ops = &ext2fs_numeric_progress_ops;
 
 	/* Set the error behavior */
-- 
2.41.0




[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux