On Fri, Aug 08, 2014 at 09:26:43PM -0700, Darrick J. Wong wrote: > This patch adds to e2fsck the ability to pre-fetch metadata into the > page cache in the hopes of speeding up fsck runs. There are two new > functions -- the first allows a caller to readahead a list of blocks, > and the second is a helper function that uses that first mechanism to > load group data (bitmaps, inode tables). > > These new e2fsck routines require the addition of a dblist API to > allow us to iterate a subset of a dblist. This will enable > incremental directory block readahead in e2fsck pass 2. > > There's also a function to estimate the readahead given a FS. > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > --- > configure | 2 > configure.in | 1 > e2fsck/Makefile.in | 8 +- > e2fsck/e2fsck.h | 18 ++++ > e2fsck/readahead.c | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++ > e2fsck/util.c | 51 ++++++++++++ > lib/config.h.in | 3 + > lib/ext2fs/dblist.c | 21 ++++- > lib/ext2fs/ext2fs.h | 10 ++ > 9 files changed, 319 insertions(+), 8 deletions(-) > create mode 100644 e2fsck/readahead.c > > > diff --git a/configure b/configure > index 8ad1408..71778e1 100755 > --- a/configure > +++ b/configure > @@ -12404,7 +12404,7 @@ fi > done > > fi > -for ac_header in dirent.h errno.h execinfo.h getopt.h malloc.h mntent.h paths.h semaphore.h setjmp.h signal.h stdarg.h stdint.h stdlib.h termios.h termio.h unistd.h utime.h attr/xattr.h linux/falloc.h linux/fd.h linux/major.h linux/loop.h net/if_dl.h netinet/in.h sys/disklabel.h sys/disk.h sys/file.h sys/ioctl.h sys/mkdev.h sys/mman.h sys/mount.h sys/prctl.h sys/resource.h sys/select.h sys/socket.h sys/sockio.h sys/stat.h sys/syscall.h sys/sysmacros.h sys/time.h sys/types.h sys/un.h sys/wait.h > +for ac_header in dirent.h errno.h execinfo.h getopt.h malloc.h mntent.h paths.h semaphore.h setjmp.h signal.h stdarg.h stdint.h stdlib.h termios.h termio.h unistd.h utime.h attr/xattr.h linux/falloc.h linux/fd.h linux/major.h linux/loop.h net/if_dl.h netinet/in.h sys/disklabel.h sys/disk.h sys/file.h sys/ioctl.h sys/mkdev.h sys/mman.h sys/mount.h sys/prctl.h sys/resource.h sys/select.h sys/socket.h sys/sockio.h sys/stat.h sys/syscall.h sys/sysctl.h sys/sysmacros.h sys/time.h sys/types.h sys/un.h sys/wait.h > do : > as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` > ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" > diff --git a/configure.in b/configure.in > index 3c0d64f..e881204 100644 > --- a/configure.in > +++ b/configure.in > @@ -941,6 +941,7 @@ AC_CHECK_HEADERS(m4_flatten([ > sys/sockio.h > sys/stat.h > sys/syscall.h > + sys/sysctl.h > sys/sysmacros.h > sys/time.h > sys/types.h > diff --git a/e2fsck/Makefile.in b/e2fsck/Makefile.in > index c40b188..3a9d7b5 100644 > --- a/e2fsck/Makefile.in > +++ b/e2fsck/Makefile.in > @@ -62,7 +62,7 @@ OBJS= dict.o unix.o e2fsck.o super.o pass1.o pass1b.o pass2.o \ > pass3.o pass4.o pass5.o journal.o badblocks.o util.o dirinfo.o \ > dx_dirinfo.o ehandler.o problem.o message.o quota.o recovery.o \ > region.o revoke.o ea_refcount.o rehash.o profile.o prof_err.o \ > - logfile.o sigcatcher.o $(MTRACE_OBJ) > + logfile.o sigcatcher.o readahead.o $(MTRACE_OBJ) > > PROFILED_OBJS= profiled/dict.o profiled/unix.o profiled/e2fsck.o \ > profiled/super.o profiled/pass1.o profiled/pass1b.o \ > @@ -73,7 +73,7 @@ PROFILED_OBJS= profiled/dict.o profiled/unix.o profiled/e2fsck.o \ > profiled/recovery.o profiled/region.o profiled/revoke.o \ > profiled/ea_refcount.o profiled/rehash.o profiled/profile.o \ > profiled/prof_err.o profiled/logfile.o \ > - profiled/sigcatcher.o > + profiled/sigcatcher.o profiled/readahead.o > > SRCS= $(srcdir)/e2fsck.c \ > $(srcdir)/dict.c \ > @@ -97,6 +97,7 @@ SRCS= $(srcdir)/e2fsck.c \ > $(srcdir)/message.c \ > $(srcdir)/ea_refcount.c \ > $(srcdir)/rehash.c \ > + $(srcdir)/readahead.c \ > $(srcdir)/region.c \ > $(srcdir)/profile.c \ > $(srcdir)/sigcatcher.c \ > @@ -527,3 +528,6 @@ quota.o: $(srcdir)/quota.c $(top_builddir)/lib/config.h \ > $(srcdir)/profile.h prof_err.h $(top_srcdir)/lib/quota/quotaio.h \ > $(top_srcdir)/lib/quota/dqblk_v2.h $(top_srcdir)/lib/quota/quotaio_tree.h \ > $(top_srcdir)/lib/../e2fsck/dict.h $(srcdir)/problem.h > +readahead.o: $(srcdir)/readahead.c $(top_builddir)/lib/config.h \ > + $(top_srcdir)/lib/ext2fs/ext2fs.h $(top_srcdir)/lib/ext2fs/ext2_fs.h \ > + $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/e2fsck.h > diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h > index 8f16218..ead546e 100644 > --- a/e2fsck/e2fsck.h > +++ b/e2fsck/e2fsck.h > @@ -490,6 +490,23 @@ extern ext2_ino_t e2fsck_get_lost_and_found(e2fsck_t ctx, int fix); > extern errcode_t e2fsck_adjust_inode_count(e2fsck_t ctx, ext2_ino_t ino, > int adj); > > +/* readahead.c */ > +#define E2FSCK_READA_SUPER (0x01) > +#define E2FSCK_READA_GDT (0x02) > +#define E2FSCK_READA_BBITMAP (0x04) > +#define E2FSCK_READA_IBITMAP (0x08) > +#define E2FSCK_READA_ITABLE (0x10) > +#define E2FSCK_READA_ALL_FLAGS (0x1F) > +errcode_t e2fsck_readahead(ext2_filsys fs, int flags, dgrp_t start, > + dgrp_t ngroups); > +#define E2FSCK_RA_DBLIST_IGNORE_BLOCKCNT (0x01) > +#define E2FSCK_RA_DBLIST_ALL_FLAGS (0x01) > +errcode_t e2fsck_readahead_dblist(ext2_filsys fs, int flags, > + ext2_dblist dblist, > + unsigned long long start, > + unsigned long long count); > +int e2fsck_can_readahead(ext2_filsys fs); > +unsigned long long e2fsck_guess_readahead(ext2_filsys fs); > > /* region.c */ > extern region_t region_create(region_addr_t min, region_addr_t max); > @@ -579,6 +596,7 @@ extern errcode_t e2fsck_allocate_subcluster_bitmap(ext2_filsys fs, > int default_type, > const char *profile_name, > ext2fs_block_bitmap *ret); > +unsigned long long get_memory_size(void); > > /* unix.c */ > extern void e2fsck_clear_progbar(e2fsck_t ctx); > diff --git a/e2fsck/readahead.c b/e2fsck/readahead.c > new file mode 100644 > index 0000000..0395975 > --- /dev/null > +++ b/e2fsck/readahead.c > @@ -0,0 +1,213 @@ > +/* > + * readahead.c -- Prefetch filesystem metadata to speed up fsck. > + * > + * Copyright (C) 2014 Oracle. > + * > + * %Begin-Header% > + * This file may be redistributed under the terms of the GNU Library > + * General Public License, version 2. > + * %End-Header% > + */ > + > +#include "config.h" > +#include <string.h> > + > +#include "e2fsck.h" > + > +#undef DEBUG > + > +#ifdef DEBUG > +# define dbg_printf(f, a...) do {printf(f, ## a); fflush(stdout); } while (0) > +#else > +# define dbg_printf(f, a...) > +#endif > + > +struct read_dblist { > + errcode_t err; > + blk64_t run_start; > + blk64_t run_len; > + int flags; > +}; > + > +static EXT2_QSORT_TYPE readahead_dir_block_cmp(const void *a, const void *b) > +{ > + const struct ext2_db_entry2 *db_a = > + (const struct ext2_db_entry2 *) a; > + const struct ext2_db_entry2 *db_b = > + (const struct ext2_db_entry2 *) b; > + > + return (int) (db_a->blk - db_b->blk); > +} > + > +static int readahead_dir_block(ext2_filsys fs, struct ext2_db_entry2 *db, > + void *priv_data) > +{ > + struct read_dblist *pr = priv_data; > + e2_blkcnt_t count = (pr->flags & E2FSCK_RA_DBLIST_IGNORE_BLOCKCNT ? > + 1 : db->blockcnt); > + > + if (!pr->run_len || db->blk != pr->run_start + pr->run_len) { > + if (pr->run_len) { > + pr->err = io_channel_cache_readahead(fs->io, > + pr->run_start, > + pr->run_len); > + dbg_printf("readahead start=%llu len=%llu err=%d\n", > + pr->run_start, pr->run_len, > + (int)pr->err); > + } > + pr->run_start = db->blk; > + pr->run_len = 0; > + } > + pr->run_len += count; > + > + return pr->err ? DBLIST_ABORT : 0; > +} > + > +errcode_t e2fsck_readahead_dblist(ext2_filsys fs, int flags, > + ext2_dblist dblist, > + unsigned long long start, > + unsigned long long count) > +{ > + errcode_t err; > + struct read_dblist pr; > + > + dbg_printf("%s: flags=0x%x\n", __func__, flags); > + if (flags & ~E2FSCK_RA_DBLIST_ALL_FLAGS) > + return EXT2_ET_INVALID_ARGUMENT; > + > + memset(&pr, 0, sizeof(pr)); > + pr.flags = flags; > + err = ext2fs_dblist_iterate3(dblist, readahead_dir_block, start, > + count, &pr); > + if (pr.err) > + return pr.err; > + if (err) > + return err; > + > + if (pr.run_len) > + err = io_channel_cache_readahead(fs->io, pr.run_start, > + pr.run_len); > + > + return err; > +} > + > +errcode_t e2fsck_readahead(ext2_filsys fs, int flags, dgrp_t start, > + dgrp_t ngroups) > +{ > + blk64_t super, old_gdt, new_gdt; > + blk_t blocks; > + dgrp_t i; > + ext2_dblist dblist; > + dgrp_t end = start + ngroups; > + errcode_t err = 0; > + > + dbg_printf("%s: flags=0x%x start=%d groups=%d\n", __func__, flags, > + start, ngroups); > + if (flags & ~E2FSCK_READA_ALL_FLAGS) > + return EXT2_ET_INVALID_ARGUMENT; > + > + if (end > fs->group_desc_count) > + end = fs->group_desc_count; > + > + if (flags == 0) > + return 0; > + > + err = ext2fs_init_dblist(fs, &dblist); It turns out that each of the calls to ext2fs_resize_mem in the ext2fs_add_dir_block2() function is costing us ~2ms for each call to this function. I'll add a new ext2fs_init_dblist() APi that lets us specify the initial size of the list. This seems to reduce the fsck runtime by a few more seconds. --D > + if (err) > + return err; > + > + for (i = start; i < end; i++) { > + err = ext2fs_super_and_bgd_loc2(fs, i, &super, &old_gdt, > + &new_gdt, &blocks); > + if (err) > + break; > + > + if (flags & E2FSCK_READA_SUPER) { > + err = ext2fs_add_dir_block2(dblist, 0, super, 0); > + if (err) > + break; > + } > + > + if (flags & E2FSCK_READA_GDT) { > + if (old_gdt) > + err = ext2fs_add_dir_block2(dblist, 0, old_gdt, > + blocks); > + else if (new_gdt) > + err = ext2fs_add_dir_block2(dblist, 0, new_gdt, > + blocks); > + else > + err = 0; > + if (err) > + break; > + } > + > + if ((flags & E2FSCK_READA_BBITMAP) && > + !ext2fs_bg_flags_test(fs, i, EXT2_BG_BLOCK_UNINIT) && > + ext2fs_bg_free_blocks_count(fs, i) < > + fs->super->s_blocks_per_group) { > + super = ext2fs_block_bitmap_loc(fs, i); > + err = ext2fs_add_dir_block2(dblist, 0, super, 1); > + if (err) > + break; > + } > + > + if ((flags & E2FSCK_READA_IBITMAP) && > + !ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT) && > + ext2fs_bg_free_inodes_count(fs, i) < > + fs->super->s_inodes_per_group) { > + super = ext2fs_inode_bitmap_loc(fs, i); > + err = ext2fs_add_dir_block2(dblist, 0, super, 1); > + if (err) > + break; > + } > + > + if ((flags & E2FSCK_READA_ITABLE) && > + ext2fs_bg_free_inodes_count(fs, i) < > + fs->super->s_inodes_per_group) { > + super = ext2fs_inode_table_loc(fs, i); > + blocks = fs->inode_blocks_per_group - > + (ext2fs_bg_itable_unused(fs, i) * > + EXT2_INODE_SIZE(fs->super) / fs->blocksize); > + err = ext2fs_add_dir_block2(dblist, 0, super, blocks); > + if (err) > + break; > + } > + } > + > + if (!err) { > + ext2fs_dblist_sort2(dblist, readahead_dir_block_cmp); > + err = e2fsck_readahead_dblist(fs, 0, dblist, 0, > + ext2fs_dblist_count2(dblist)); > + } > + > + ext2fs_free_dblist(dblist); > + return err; > +} > + > +int e2fsck_can_readahead(ext2_filsys fs) > +{ > + errcode_t err; > + > + err = io_channel_cache_readahead(fs->io, 0, 1); > + dbg_printf("%s: supp=%d\n", __func__, err != EXT2_ET_OP_NOT_SUPPORTED); > + return err != EXT2_ET_OP_NOT_SUPPORTED; > +} > + > +unsigned long long e2fsck_guess_readahead(ext2_filsys fs) > +{ > + unsigned long long guess; > + > + /* > + * The optimal readahead sizes were experimentally determined by > + * djwong in August 2014. Setting the RA size to one block group's > + * worth of inode table blocks seems to yield the largest reductions > + * in e2fsck runtime. > + */ > + guess = fs->blocksize * fs->inode_blocks_per_group; > + > + /* Disable RA if it'd use more 1/100th of RAM. */ > + if (get_memory_size() > (guess * 100)) > + return guess / 1024; > + > + return 0; > +} > diff --git a/e2fsck/util.c b/e2fsck/util.c > index 8237328..74f20062 100644 > --- a/e2fsck/util.c > +++ b/e2fsck/util.c > @@ -37,6 +37,10 @@ > #include <errno.h> > #endif > > +#ifdef HAVE_SYS_SYSCTL_H > +#include <sys/sysctl.h> > +#endif > + > #include "e2fsck.h" > > extern e2fsck_t e2fsck_global_ctx; /* Try your very best not to use this! */ > @@ -848,3 +852,50 @@ errcode_t e2fsck_allocate_subcluster_bitmap(ext2_filsys fs, const char *descr, > fs->default_bitmap_type = save_type; > return retval; > } > + > +/* Return memory size in bytes */ > +unsigned long long get_memory_size(void) > +{ > +#if defined(_SC_PHYS_PAGES) > +# if defined(_SC_PAGESIZE) > + return (unsigned long long)sysconf(_SC_PHYS_PAGES) * > + (unsigned long long)sysconf(_SC_PAGESIZE); > +# elif defined(_SC_PAGE_SIZE) > + return (unsigned long long)sysconf(_SC_PHYS_PAGES) * > + (unsigned long long)sysconf(_SC_PAGE_SIZE); > +# endif > +#elif defined(CTL_HW) > +# if (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) > +# define CTL_HW_INT64 > +# elif (defined(HW_PHYSMEM) || defined(HW_REALMEM)) > +# define CTL_HW_UINT > +# endif > + int mib[2]; > + > + mib[0] = CTL_HW; > +# if defined(HW_MEMSIZE) > + mib[1] = HW_MEMSIZE; > +# elif defined(HW_PHYSMEM64) > + mib[1] = HW_PHYSMEM64; > +# elif defined(HW_REALMEM) > + mib[1] = HW_REALMEM; > +# elif defined(HW_PYSMEM) > + mib[1] = HW_PHYSMEM; > +# endif > +# if defined(CTL_HW_INT64) > + unsigned long long size = 0; > +# elif defined(CTL_HW_UINT) > + unsigned int size = 0; > +# endif > +# if defined(CTL_HW_INT64) || defined(CTL_HW_UINT) > + size_t len = sizeof(size); > + > + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) > + return (unsigned long long)size; > +# endif > + return 0; > +#else > +# warning "Don't know how to detect memory on your platform?" > + return 0; > +#endif > +} > diff --git a/lib/config.h.in b/lib/config.h.in > index 1d2382b..b598d1e 100644 > --- a/lib/config.h.in > +++ b/lib/config.h.in > @@ -500,6 +500,9 @@ > /* Define to 1 if you have the <sys/syscall.h> header file. */ > #undef HAVE_SYS_SYSCALL_H > > +/* Define to 1 if you have the <sys/sysctl.h> header file. */ > +#undef HAVE_SYS_SYSCTL_H > + > /* Define to 1 if you have the <sys/sysmacros.h> header file. */ > #undef HAVE_SYS_SYSMACROS_H > > diff --git a/lib/ext2fs/dblist.c b/lib/ext2fs/dblist.c > index 942c4f0..bbdb221 100644 > --- a/lib/ext2fs/dblist.c > +++ b/lib/ext2fs/dblist.c > @@ -194,20 +194,25 @@ void ext2fs_dblist_sort2(ext2_dblist dblist, > /* > * This function iterates over the directory block list > */ > -errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist, > +errcode_t ext2fs_dblist_iterate3(ext2_dblist dblist, > int (*func)(ext2_filsys fs, > struct ext2_db_entry2 *db_info, > void *priv_data), > + unsigned long long start, > + unsigned long long count, > void *priv_data) > { > - unsigned long long i; > + unsigned long long i, end; > int ret; > > EXT2_CHECK_MAGIC(dblist, EXT2_ET_MAGIC_DBLIST); > > + end = start + count; > if (!dblist->sorted) > ext2fs_dblist_sort2(dblist, 0); > - for (i=0; i < dblist->count; i++) { > + if (end > dblist->count) > + end = dblist->count; > + for (i = start; i < end; i++) { > ret = (*func)(dblist->fs, &dblist->list[i], priv_data); > if (ret & DBLIST_ABORT) > return 0; > @@ -215,6 +220,16 @@ errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist, > return 0; > } > > +errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist, > + int (*func)(ext2_filsys fs, > + struct ext2_db_entry2 *db_info, > + void *priv_data), > + void *priv_data) > +{ > + return ext2fs_dblist_iterate3(dblist, func, 0, dblist->count, > + priv_data); > +} > + > static EXT2_QSORT_TYPE dir_block_cmp2(const void *a, const void *b) > { > const struct ext2_db_entry2 *db_a = > diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h > index b4a9f84..04a1f4a 100644 > --- a/lib/ext2fs/ext2fs.h > +++ b/lib/ext2fs/ext2fs.h > @@ -1052,11 +1052,17 @@ extern void ext2fs_dblist_sort2(ext2_dblist dblist, > extern errcode_t ext2fs_dblist_iterate(ext2_dblist dblist, > int (*func)(ext2_filsys fs, struct ext2_db_entry *db_info, > void *priv_data), > - void *priv_data); > + void *priv_data); > extern errcode_t ext2fs_dblist_iterate2(ext2_dblist dblist, > int (*func)(ext2_filsys fs, struct ext2_db_entry2 *db_info, > void *priv_data), > - void *priv_data); > + void *priv_data); > +extern errcode_t ext2fs_dblist_iterate3(ext2_dblist dblist, > + int (*func)(ext2_filsys fs, struct ext2_db_entry2 *db_info, > + void *priv_data), > + unsigned long long start, > + unsigned long long count, > + void *priv_data); > extern errcode_t ext2fs_set_dir_block(ext2_dblist dblist, ext2_ino_t ino, > blk_t blk, int blockcnt); > extern errcode_t ext2fs_set_dir_block2(ext2_dblist dblist, ext2_ino_t ino, > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html