On Tue, Jul 08, 2014 at 08:44:51PM -0400, Theodore Ts'o wrote: > Add the mke2fs.conf configuration option which causes the hugefiles to > be aligned to the beginning of the disk. This is important if the > reason for aligning the hugefiles is to support hard-drive specific > features such as Shingled Magnetic Recording (SMR). > > Signed-off-by: Theodore Ts'o <tytso@xxxxxxx> > --- > misc/mk_hugefiles.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++-- > misc/mke2fs.c | 2 +- > misc/mke2fs.conf.5.in | 7 +++ > misc/mke2fs.h | 2 +- > 4 files changed, 157 insertions(+), 8 deletions(-) > > diff --git a/misc/mk_hugefiles.c b/misc/mk_hugefiles.c > index b7a9840..ea42b6c 100644 > --- a/misc/mk_hugefiles.c > +++ b/misc/mk_hugefiles.c > @@ -3,9 +3,11 @@ > */ > > #define _XOPEN_SOURCE 600 /* for inclusion of PATH_MAX in Solaris */ > +#define _BSD_SOURCE /* for makedev() and major() */ > > #include "config.h" > #include <stdio.h> > +#include <stdarg.h> > #include <string.h> > #include <strings.h> > #include <fcntl.h> > @@ -60,6 +62,141 @@ static char *fn_buf; > static char *fn_numbuf; > int zero_hugefile = 1; > > +#define SYSFS_PATH_LEN 256 > +typedef char sysfs_path_t[SYSFS_PATH_LEN]; > + > +#ifndef HAVE_SNPRINTF > +/* > + * We are very careful to avoid needing to worry about buffer > + * overflows, so we don't really need to use snprintf() except as an > + * additional safety check. So if snprintf() is not present, it's > + * safe to fall back to vsprintf(). This provides portability since > + * vsprintf() is guaranteed by C89, while snprintf() is only > + * guaranteed by C99 --- which for example, Microsoft Visual Studio > + * has *still* not bothered to implement. :-/ (Not that I expect > + * mke2fs to be ported to MS Visual Studio any time soon, but > + * libext2fs *does* get built on Microsoft platforms, and we might > + * want to move this into libext2fs some day.) > + */ > +static int my_snprintf(char *str, size_t size, const char *format, ...) 
> +{ > + va_list ap; > + int ret; > + > + va_start(ap, format); > + ret = vsprintf(str, format, ap); > + va_end(ap); > + return ret; > +} > + > +#define snprintf my_snprintf > +#endif > + > +/* > + * Fall back to Linux's definitions of makedev and major are needed. > + * The search_sysfs_block() function is highly unlikely to work on > + * non-Linux systems anyway. > + */ > +#ifndef makedev > +#define makedev(maj, min) (((maj) << 8) + (min)) > +#endif > + > +static char *search_sysfs_block(dev_t devno, sysfs_path_t ret_path) > +{ > + struct dirent *de, *p_de; > + DIR *dir = NULL, *p_dir = NULL; > + FILE *f; > + sysfs_path_t path, p_path; > + unsigned int major, minor; > + char *ret = ret_path; > + > + ret_path[0] = 0; > + if ((dir = opendir("/sys/block")) == NULL) > + return NULL; > + while ((de = readdir(dir)) != NULL) { > + if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..") || > + strlen(de->d_name) > sizeof(path)-32) > + continue; > + snprintf(path, SYSFS_PATH_LEN, > + "/sys/block/%s/dev", de->d_name); > + f = fopen(path, "r"); > + if (f && > + (fscanf(f, "%u:%u", &major, &minor) == 2)) { > + fclose(f); f = NULL; > + if (makedev(major, minor) == devno) { > + snprintf(ret_path, SYSFS_PATH_LEN, > + "/sys/block/%s", de->d_name); > + goto success; > + } > +#ifdef major > + if (major(devno) != major) > + continue; > +#endif > + } > + if (f) > + fclose(f); > + > + snprintf(path, SYSFS_PATH_LEN, "/sys/block/%s", de->d_name); > + > + if (p_dir) > + closedir(p_dir); > + if ((p_dir = opendir(path)) == NULL) > + continue; > + while ((p_de = readdir(p_dir)) != NULL) { > + if (!strcmp(p_de->d_name, ".") || > + !strcmp(p_de->d_name, "..") || > + (strlen(p_de->d_name) > > + SYSFS_PATH_LEN - strlen(path) - 32)) > + continue; > + snprintf(p_path, SYSFS_PATH_LEN, "%s/%s/dev", > + path, p_de->d_name); > + > + f = fopen(p_path, "r"); > + if (f && > + (fscanf(f, "%u:%u", &major, &minor) == 2) && > + (((major << 8) + minor) == devno)) { > + fclose(f); > + snprintf(ret_path, 
SYSFS_PATH_LEN, "%s/%s", > + path, p_de->d_name); > + goto success; > + } > + if (f) > + fclose(f); > + } > + } > + ret = NULL; > +success: > + if (dir) > + closedir(dir); > + if (p_dir) > + closedir(p_dir); > + return ret; > +} > + > +static blk64_t get_partition_start(const char *device_name) > +{ > + unsigned long long start; > + sysfs_path_t path; > + struct stat st; > + FILE *f; > + char *cp; > + int n; > + > + if ((stat(device_name, &st) < 0) || !S_ISBLK(st.st_mode)) > + return 0; > + > + cp = search_sysfs_block(st.st_rdev, path); > + if (!cp) > + return 0; > + strncat(path, "/start", SYSFS_PATH_LEN); The third argument to strncat() is the maximum number of bytes to append from the second argument ("/start"), not the size of the destination buffer, so passing SYSFS_PATH_LEN here does not bound the write into path. Though it's unlikely that we'll ever find anything in /sys/block approaching 255 characters, we might as well guard against stack corruption: if (strlen(path) > SYSFS_PATH_LEN - strlen("/start") - 1) return 0; strcat(path, "/start"); Oh, I guess coverity is complaining about this too. Though FWIW, "find /sys | while read f; do echo "$f" | wc -c; done | sort -g | tail -n 5" spits out "133" as the longest sysfs path ever. I guess that could be much longer on some multi-node NUMA box or whatever. <shrug> /me codes up a fix, tosses it on the patch pile. --D > + f = fopen(path, "r"); > + if (!f) > + return 0; > + n = fscanf(f, "%llu", &start); > + fclose(f); > + return (n == 1) ? 
start : 0; > +} > + > static errcode_t create_directory(ext2_filsys fs, char *dir, > ext2_ino_t *ret_ino) > > @@ -310,24 +447,26 @@ static blk64_t get_start_block(ext2_filsys fs, blk64_t slack) > return blk; > } > > -static blk64_t round_up_align(blk64_t b, unsigned long align) > +static blk64_t round_up_align(blk64_t b, unsigned long align, > + blk64_t part_offset) > { > unsigned long m; > > if (align == 0) > return b; > - m = b % align; > + part_offset = part_offset % align; > + m = (b + part_offset) % align; > if (m) > b += align - m; > return b; > } > > -errcode_t mk_hugefiles(ext2_filsys fs) > +errcode_t mk_hugefiles(ext2_filsys fs, const char *device_name) > { > unsigned long i; > ext2_ino_t dir; > errcode_t retval; > - blk64_t fs_blocks; > + blk64_t fs_blocks, part_offset; > unsigned long align; > int d, dsize; > char *t; > @@ -348,7 +487,10 @@ errcode_t mk_hugefiles(ext2_filsys fs) > t = get_string_from_profile(fs_types, "hugefiles_align", "0"); > align = parse_num_blocks2(t, fs->super->s_log_block_size); > free(t); > - num_blocks = round_up_align(num_blocks, align); > + if (get_bool_from_profile(fs_types, "hugefiles_align_disk", 0)) > + part_offset = get_partition_start(device_name) / > + (fs->blocksize / 512); > + num_blocks = round_up_align(num_blocks, align, 0); > zero_hugefile = get_bool_from_profile(fs_types, "zero_hugefiles", > zero_hugefile); > > @@ -400,7 +542,7 @@ errcode_t mk_hugefiles(ext2_filsys fs) > num_slack += calc_overhead(fs, num_blocks) * num_files; > num_slack += (num_files / 16) + 1; /* space for dir entries */ > goal = get_start_block(fs, num_slack); > - goal = round_up_align(goal, align); > + goal = round_up_align(goal, align, part_offset); > > if ((num_blocks ? 
num_blocks : fs_blocks) > > (0x80000000UL / fs->blocksize)) > diff --git a/misc/mke2fs.c b/misc/mke2fs.c > index ecd47e6..da77e3a 100644 > --- a/misc/mke2fs.c > +++ b/misc/mke2fs.c > @@ -2913,7 +2913,7 @@ no_journal: > EXT4_FEATURE_RO_COMPAT_QUOTA)) > create_quota_inodes(fs); > > - retval = mk_hugefiles(fs); > + retval = mk_hugefiles(fs, device_name); > if (retval) > com_err(program_name, retval, "while creating huge files"); > > diff --git a/misc/mke2fs.conf.5.in b/misc/mke2fs.conf.5.in > index 8e25892..19458ac 100644 > --- a/misc/mke2fs.conf.5.in > +++ b/misc/mke2fs.conf.5.in > @@ -480,6 +480,13 @@ files. It also forces the size of huge files to be a multiple of the > requested alignment. If this relation is not specified, no alignment > requirement will be imposed on the huge files. > .TP > +.I hugefiles_align_disk > +This relation specifies whether the alignment should be relative to the > +beginning of the hard drive (assuming that the starting offset of the > +partition is available to mke2fs). The default value is false, which > +will cause hugefile alignment to be relative to the beginning of the > +file system. > +.TP > .I hugefiles_name > This relation specifies the base file name for the huge files. 
> .TP > diff --git a/misc/mke2fs.h b/misc/mke2fs.h > index 9fa6bfe..ce72cb3 100644 > --- a/misc/mke2fs.h > +++ b/misc/mke2fs.h > @@ -24,7 +24,7 @@ extern int get_bool_from_profile(char **types, const char *opt, int def_val); > extern int int_log10(unsigned long long arg); > > /* mk_hugefiles.c */ > -extern errcode_t mk_hugefiles(ext2_filsys fs); > +extern errcode_t mk_hugefiles(ext2_filsys fs, const char *device_name); > > > > -- > 2.0.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html