[PATCH 2/2] libext2fs/e2fsck: implement metadata prefetching

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Use threads with our new mmap IO manager to prefetch metadata.  This
results in a major e2fsck run time speedup.  There's also a cruder
multiprocess version that works with the good old UNIX IO manager: it
forks a child that reads the metadata just to get it into the page cache.

Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
---
 e2fsck/unix.c          |   13 +
 lib/ext2fs/Makefile.in |    8 +
 lib/ext2fs/ext2fs.h    |   13 +
 lib/ext2fs/prefetch.c  |  456 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 490 insertions(+)
 create mode 100644 lib/ext2fs/prefetch.c


diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index eeeef7c..33afc06 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -1181,6 +1181,7 @@ int main (int argc, char *argv[])
 	__u32 features[3];
 	char *cp;
 	int qtype = -99;  /* quota type */
+	struct ext2fs_prefetch_handle *h = NULL;
 
 	clear_problem_context(&pctx);
 	sigcatcher_setup();
@@ -1638,9 +1639,21 @@ print_unsupp_features:
 		quota_init_context(&ctx->qctx, ctx->fs, qtype);
 	}
 
+	if (getenv("PREFETCH")) {
+		int flags = PREFETCH_INODES | PREFETCH_DIRS | PREFETCH_THREADED;
+		if (getenv("PREFETCH_WAIT"))
+			flags &= ~PREFETCH_THREADED;
+		retval = ext2fs_prefetch(fs, flags, &h);
+		if (retval)
+			com_err(ctx->program_name, retval, "prefetching");
+	}
+
 	run_result = e2fsck_run(ctx);
 	e2fsck_clear_progbar(ctx);
 
+	if (h)
+		ext2fs_prefetch_free(&h);
+
 	if (ctx->flags & E2F_FLAG_JOURNAL_INODE) {
 		if (fix_problem(ctx, PR_6_RECREATE_JOURNAL, &pctx)) {
 			if (journal_size < 1024)
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index a1b5a01..9fbf2b5 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -73,6 +73,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
 	native.o \
 	newdir.o \
 	openfs.o \
+	prefetch.o \
 	progress.o \
 	punch.o \
 	qcow2.o \
@@ -150,6 +151,7 @@ SRCS= ext2_err.c \
 	$(srcdir)/native.c \
 	$(srcdir)/newdir.c \
 	$(srcdir)/openfs.c \
+	$(srcdir)/prefetch.c \
 	$(srcdir)/progress.c \
 	$(srcdir)/punch.c \
 	$(srcdir)/qcow2.c \
@@ -863,6 +865,12 @@ openfs.o: $(srcdir)/openfs.c $(top_builddir)/lib/config.h \
  $(srcdir)/ext2_fs.h $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h \
  $(srcdir)/ext2_io.h $(top_builddir)/lib/ext2fs/ext2_err.h \
  $(srcdir)/ext2_ext_attr.h $(srcdir)/bitops.h $(srcdir)/e2image.h
+prefetch.o: $(srcdir)/prefetch.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/ext2fs.h \
+ $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2_fs.h \
+ $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h $(srcdir)/ext2_io.h \
+ $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/ext2_ext_attr.h \
+ $(srcdir)/bitops.h $(srcdir)/ext2fsP.h
 progress.o: $(srcdir)/progress.c $(top_builddir)/lib/config.h \
  $(top_builddir)/lib/dirpaths.h $(srcdir)/ext2fs.h \
  $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2_fs.h \
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index ba5c388..e634d2c 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -1522,6 +1522,19 @@ errcode_t ext2fs_mmp_update2(ext2_filsys fs, int immediately);
 errcode_t ext2fs_mmp_stop(ext2_filsys fs);
 unsigned ext2fs_mmp_new_seq(void);
 
+/* prefetch.c */
+#define PREFETCH_THREADED	(0x00000001)	/* run in a helper thread/process */
+#define PREFETCH_ERROR_ABORT	(0x00000002)	/* stop at the first read error */
+#define PREFETCH_BITMAPS	(0x00000004)	/* call ext2fs_read_bitmaps() */
+#define PREFETCH_INODES		(0x00000008)	/* read the inode tables */
+#define PREFETCH_MAPS		(0x00000010)	/* presumably: walk file block maps */
+#define PREFETCH_DIRS		(0x00000020)	/* presumably: read directory blocks */
+struct ext2fs_prefetch_handle;
+errcode_t ext2fs_prefetch(ext2_filsys fs, int flags,
+			  struct ext2fs_prefetch_handle **h);
+errcode_t ext2fs_prefetch_wait(struct ext2fs_prefetch_handle *h);
+errcode_t ext2fs_prefetch_free(struct ext2fs_prefetch_handle **h);
+
 /* read_bb.c */
 extern errcode_t ext2fs_read_bb_inode(ext2_filsys fs,
 				      ext2_badblocks_list *bb_list);
diff --git a/lib/ext2fs/prefetch.c b/lib/ext2fs/prefetch.c
new file mode 100644
index 0000000..022af41
--- /dev/null
+++ b/lib/ext2fs/prefetch.c
@@ -0,0 +1,456 @@
+/*
+ * prefetch.c --- Prefetch filesystem metadata.
+ *
+ * Copyright (C) 2014 by Oracle, Darrick J. Wong.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Library
+ * General Public License, version 2.
+ * %End-Header%
+ */
+
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <pthread.h>
+
+#include "config.h"
+#include "ext2_fs.h"
+#include "ext2fs.h"
+
+#define USE_THREADS 1	/* selects prefetch_thread() when USE_SUPER is 0 */
+#define USE_SUPER 1	/* selects super_prefetch() as the prefetch worker */
+
+struct ext2fs_prefetch_handle {
+	ext2_filsys fs;		/* filesystem being prefetched */
+	int flags;		/* PREFETCH_* flags passed to ext2fs_prefetch() */
+	int done;		/* set to 1 once prefetching finished or was waited for */
+	pid_t pid;		/* forked prefetch child; 0 if none was started */
+#ifdef USE_THREADS
+	pthread_t tid;		/* prefetch thread, when one was created */
+#endif
+};
+
+static int ignore_block(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+			blk64_t ref_blk, int ref_offset, void *priv_data)
+{
+	return 0;	/* block-iterate callback that deliberately does nothing */
+}
+
+struct dirent_iterate {
+	void *buf;	/* scratch buffer that dirent_block() reads blocks into */
+	int flags;	/* PREFETCH_* flags; only PREFETCH_ERROR_ABORT is checked */
+};
+
+static int dirent_block(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+			blk64_t ref_blk, int ref_offset, void *priv_data)
+{
+	struct dirent_iterate *ctx = priv_data;
+
+	/* Read the block into the scratch buffer; the data is never looked at. */
+	if (io_channel_read_blk64(fs->io, *blocknr, 1, ctx->buf) == 0)
+		return 0;
+	/* A failed read only stops the iteration if the caller asked for that. */
+	return (ctx->flags & PREFETCH_ERROR_ABORT) ? BLOCK_ABORT : 0;
+}
+
+/*
+ * First dumb prefetch implementation: read the metadata selected by the
+ * PREFETCH_* bits in flags through the IO channel and discard it, so that
+ * it at least lands in the page cache.  Returns nothing: failures are
+ * skipped, or end the run early when PREFETCH_ERROR_ABORT is set.
+ */
+static void do_ext2fs_prefetch(ext2_filsys fs, int flags)
+{
+	void			*buf;
+	blk64_t			blk;
+	dgrp_t			i;
+	ext2_inode_scan		scan;
+	ext2_ino_t		ino;
+	errcode_t		err;
+	struct ext2_inode	inode;
+	struct dirent_iterate	di;
+	int			iter_flags;
+	unsigned int		blocks_to_read;
+
+	/* Scratch space sized for one block group's whole inode table. */
+	err = ext2fs_get_array(fs->blocksize, fs->inode_blocks_per_group, &buf);
+	if (err)
+		return;
+
+	/* load bitmaps */
+	if (flags & PREFETCH_BITMAPS) {
+		err = ext2fs_read_bitmaps(fs);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			goto out;
+	}
+
+	/* load inode tables (skipped when the inode scan below will run) */
+	if (!(flags & PREFETCH_INODES) ||
+	    (flags & (PREFETCH_MAPS | PREFETCH_DIRS)))
+		goto skip_itable;
+
+	for (i = 0; i < fs->group_desc_count; i++) {
+		if (ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT))
+			continue;
+
+		blocks_to_read = fs->inode_blocks_per_group;
+		if (ext2fs_has_group_desc_csum(fs)) {
+			unsigned int num_inodes = fs->super->s_inodes_per_group;
+			unsigned int unused = ext2fs_bg_itable_unused(fs, i);
+
+			/*
+			 * Don't trust a corrupt count: never read more than
+			 * buf holds.  Round up to catch a partial last block.
+			 */
+			if (unused <= num_inodes)
+				num_inodes -= unused;
+			blocks_to_read = (num_inodes *
+					  EXT2_INODE_SIZE(fs->super) +
+					  fs->blocksize - 1) / fs->blocksize;
+			if (blocks_to_read > fs->inode_blocks_per_group)
+				blocks_to_read = fs->inode_blocks_per_group;
+		}
+
+		blk = ext2fs_inode_table_loc(fs, i);
+		err = io_channel_read_blk64(fs->io, blk, blocks_to_read, buf);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			goto out;
+	}
+
+skip_itable:
+	/* load inodes */
+	if (!(flags & (PREFETCH_MAPS | PREFETCH_DIRS)))
+		goto out;
+
+	/* Without a scan handle there is nothing to walk and nothing to close. */
+	err = ext2fs_open_inode_scan(fs, 0, &scan);
+	if (err)
+		goto out;
+
+	di.buf = buf;
+	di.flags = flags;
+	iter_flags = BLOCK_FLAG_READ_ONLY | BLOCK_FLAG_DATA_ONLY;
+	do {
+		err = ext2fs_get_next_inode_full(scan, &ino, &inode,
+						 sizeof(inode));
+		if (err || !ino)
+			break;
+		/*
+		 * Skip inodes that are not in use.  fs->inode_map may still
+		 * be unset when the bitmaps were not read; in that case use
+		 * the link count and file type instead.
+		 */
+		if (fs->inode_map) {
+			if (!ext2fs_test_inode_bitmap2(fs->inode_map, ino))
+				continue;
+		} else if (!inode.i_links_count ||
+			   (inode.i_mode & 0xF000) == 0) {
+			continue;
+		}
+
+		/*
+		 * Directories get their blocks read; any other inode just
+		 * has its block map walked, whatever mix of MAPS/DIRS is set.
+		 */
+		err = ext2fs_block_iterate3(fs, ino, iter_flags, NULL,
+					    LINUX_S_ISDIR(inode.i_mode) ?
+					    dirent_block : ignore_block, &di);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			break;
+
+		blk = ext2fs_file_acl_block(fs, &inode);
+		if (!blk)
+			continue;
+		err = io_channel_read_blk64(fs->io, blk, 1, buf);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			break;
+	} while (ino);
+
+	ext2fs_close_inode_scan(scan);
+out:
+	ext2fs_free_mem(&buf);
+}
+
+static void *prefetch_thread(void *data)	/* pthread start routine */
+{
+	struct ext2fs_prefetch_handle *handle = data;
+	do_ext2fs_prefetch(handle->fs, handle->flags);
+	return NULL;		/* nothing to hand back to pthread_join() */
+}
+
+/*
+ * Second, less dumb prefetch: Use threads to preload metadata in group order.
+ */
+struct super_entry {
+	dgrp_t group;		/* block group number */
+	ext2_ino_t num_inodes;	/* inodes to read from the start of that group's table */
+};
+
+struct super_thread {
+	ext2_filsys fs;		/* this worker's own handle from ext2fs_dup_handle() */
+	pthread_t tid;		/* filled in by pthread_create() */
+	int flags;		/* PREFETCH_* flags */
+	struct super_entry *start, *end;	/* first entry to process; end of the array */
+	unsigned int skip_factor;	/* stride through the entries (= thread count) */
+	void *buf;		/* private scratch buffer, one inode table in size */
+};
+
+/*
+ * Worker thread body for super_prefetch().  data is a struct super_thread;
+ * the worker handles entries start, start + skip_factor, ... below end.
+ *
+ * Pass one reads each of those groups' inode tables into t->buf.  Pass two
+ * reads the first num_inodes inodes of each group and, for the ones that
+ * look in use, iterates over their blocks and reads the file ACL block.  The
+ * PREFETCH_INODES/MAPS/DIRS bits are not consulted here.  A read error ends
+ * the worker only when PREFETCH_ERROR_ABORT is set.  Always returns NULL.
+ */
+static void *super_func(void *data)
+{
+	struct super_thread *t = data;
+	ext2_filsys fs = t->fs;
+	int flags = t->flags;
+	void *buf = t->buf;
+	struct super_entry *e;
+	unsigned int blocks_to_read;
+	blk64_t blk;
+	ext2_ino_t i, ino, nr_inodes;
+	struct dirent_iterate di;
+	struct ext2_inode inode;
+	int iter_flags = BLOCK_FLAG_READ_ONLY | BLOCK_FLAG_DATA_ONLY;
+	errcode_t err;
+
+	/* Read the inode tables */
+	for (e = t->start; e < t->end; e += t->skip_factor) {
+		/* Round up to whole blocks, capped at what buf can hold. */
+		blocks_to_read = (e->num_inodes * EXT2_INODE_SIZE(fs->super) +
+				  fs->blocksize - 1) / fs->blocksize;
+		if (blocks_to_read > fs->inode_blocks_per_group)
+			blocks_to_read = fs->inode_blocks_per_group;
+		blk = ext2fs_inode_table_loc(fs, e->group);
+		err = io_channel_read_blk64(fs->io, blk, blocks_to_read, buf);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			return NULL;
+	}
+
+	/* Scan inodes for extent/dir blocks */
+	di.buf = buf;
+	di.flags = flags;
+	for (e = t->start; e < t->end; e += t->skip_factor) {
+		/* A group cannot hold more than s_inodes_per_group inodes. */
+		nr_inodes = e->num_inodes;
+		if (nr_inodes > fs->super->s_inodes_per_group)
+			nr_inodes = fs->super->s_inodes_per_group;
+
+		for (i = 0; i < nr_inodes; i++) {
+			/* Inode numbers start at 1: group g begins at g*ipg + 1. */
+			ino = e->group * fs->super->s_inodes_per_group + i + 1;
+			err = ext2fs_read_inode(fs, ino, &inode);
+			if (err)
+				continue;
+			/* Skip unlinked or unknown-type inodes */
+			if (!inode.i_links_count ||
+			    (inode.i_mode & 0xF000) == 0)
+				continue;
+
+			/* dirs: read the blocks; all else: just iterate */
+			err = ext2fs_block_iterate3(fs, ino, iter_flags, NULL,
+					LINUX_S_ISDIR(inode.i_mode) ?
+					dirent_block : ignore_block, &di);
+			if (err && (flags & PREFETCH_ERROR_ABORT))
+				return NULL;
+
+			blk = ext2fs_file_acl_block(fs, &inode);
+			if (!blk)
+				continue;
+			err = io_channel_read_blk64(fs->io, blk, 1, buf);
+			if (err && (flags & PREFETCH_ERROR_ABORT))
+				return NULL;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Less dumb prefetcher (also a pthread start routine, hence void *): share
+ * the in-use block groups among super_func() workers, then set pd->done.
+ */
+static void *super_prefetch(void *data)
+{
+	struct ext2fs_prefetch_handle *pd = data;
+	ext2_filsys fs = pd->fs;
+	void *b = NULL, *r;
+	char *bufs;
+	struct super_thread *threads, *t;
+	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);	/* -1 on error */
+	unsigned int num_threads = ncpus > 0 ? ncpus : 1;
+	unsigned int j, started = 0, num_entries = 0, max_entries = 0;
+	struct super_entry *entries = NULL;
+	size_t bufsz = (size_t)fs->blocksize * fs->inode_blocks_per_group;
+	const char *env = getenv("PREFETCH_THREADS");
+	dgrp_t i;
+	ext2_ino_t num_inodes, unused;
+
+	/* Find all non-empty groups */
+	for (i = 0; i < fs->group_desc_count; i++) {
+		if (ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT))
+			continue;
+		num_inodes = fs->super->s_inodes_per_group;
+		if (ext2fs_bg_free_inodes_count(fs, i) == num_inodes)
+			continue;
+		if (ext2fs_has_group_desc_csum(fs)) {
+			/* Ignore an impossible (corrupt) unused count. */
+			unused = ext2fs_bg_itable_unused(fs, i);
+			if (unused <= num_inodes)
+				num_inodes -= unused;
+		}
+		if (num_entries == max_entries) {
+			r = realloc(entries, (max_entries + 32) *
+					     sizeof(*entries));
+			if (r == NULL)
+				goto out;
+			entries = r;
+			max_entries += 32;
+		}
+		entries[num_entries].group = i;
+		entries[num_entries].num_inodes = num_inodes;
+		num_entries++;
+	}
+
+	/* Set up the threads: PREFETCH_THREADS overrides the CPU count. */
+	if (env && atoi(env) > 0)
+		num_threads = atoi(env);
+	if (num_threads > num_entries)	/* spare workers would sit idle */
+		num_threads = num_entries;
+	if (num_threads == 0)
+		goto done;
+	/* One allocation: num_threads scratch buffers, then the thread array. */
+	if (ext2fs_get_arrayzero(num_threads, sizeof(*threads) + bufsz, &b))
+		goto out;
+	bufs = b;
+	threads = (struct super_thread *)(bufs + bufsz * num_threads);
+	for (j = 0, t = threads; j < num_threads; j++, t++) {
+		t->flags = pd->flags;
+		t->start = entries + j;
+		t->end = entries + num_entries;
+		t->skip_factor = num_threads;
+		t->buf = bufs + bufsz * j;
+		if (ext2fs_dup_handle(fs, &t->fs))
+			break;		/* join what already runs */
+		t->fs->icache = NULL;
+		if (pthread_create(&t->tid, NULL, super_func, t))
+			break;
+		started++;
+	}
+	/* Wait for the threads that were actually started */
+	for (j = 0, t = threads; j < started; j++, t++)
+		pthread_join(t->tid, NULL);
+	ext2fs_free_mem(&b);	/* NOTE(review): dup'ed handles are never freed */
+done:
+	pd->done = 1;
+out:
+	free(entries);
+	return NULL;
+}
+
+struct unix_private_data_hack {	/* presumably mirrors the head of unix_io's private data -- verify */
+	int	magic;
+	int	dev;	/* overwritten with a freshly opened fd in the forked child */
+};
+
+/*
+ * Prefetch the metadata picked by flags; returns 0 and a handle in *h.  With
+ * PREFETCH_THREADED it runs in a thread (mmap IO) or a child (unix IO).
+ */
+errcode_t ext2fs_prefetch(ext2_filsys fs, int flags,
+			  struct ext2fs_prefetch_handle **h)
+{
+	struct ext2fs_prefetch_handle *pd;
+	errcode_t err;
+
+	err = ext2fs_get_memzero(sizeof(*pd), &pd);
+	if (err)
+		return err;
+	pd->fs = fs;
+	pd->flags = flags;
+	if (flags & PREFETCH_THREADED) {
+		if (fs->io->manager == mmap_io_manager) {
+#if USE_SUPER
+			struct timespec ts;
+			err = pthread_create(&pd->tid, NULL, super_prefetch, pd);
+			if (err)
+				goto errout;
+			ts.tv_sec = 0; ts.tv_nsec = 500000;	/* 0.5ms head start */
+			nanosleep(&ts, NULL);
+#elif USE_THREADS
+			err = pthread_create(&pd->tid, NULL, prefetch_thread, pd);
+			if (err)
+				goto errout;
+#else
+			goto single_thread;
+#endif
+		} else if (fs->io->manager == unix_io_manager) {
+			pd->pid = fork();
+			if (pd->pid < 0) {
+				err = errno;
+				goto errout;
+			} else if (pd->pid == 0) {
+				struct unix_private_data_hack *m =
+						fs->io->private_data;
+				m->dev = open(fs->device_name, O_RDONLY);
+				if (m->dev >= 0)
+					do_ext2fs_prefetch(fs, flags);
+				_exit(0); /* no atexit/stdio flush in the child */
+			}
+		}
+	} else {	/* not threaded: finished before we return */
+single_thread:
+#if USE_SUPER
+		super_prefetch(pd);
+#else
+		do_ext2fs_prefetch(fs, flags);
+#endif
+		pd->done = 1;
+	}
+	*h = pd;
+	return 0;
+errout:
+	ext2fs_free_mem(&pd);
+	return err;
+}
+
+errcode_t ext2fs_prefetch_wait(struct ext2fs_prefetch_handle *h)
+{
+	/* Reap the helper.  Keyed on tid/pid, not h->done (the worker sets it). */
+	if ((h->flags & PREFETCH_THREADED) && h->tid) {
+		pthread_join(h->tid, NULL);
+		h->tid = 0;			/* never join twice */
+	}
+	if ((h->flags & PREFETCH_THREADED) && h->pid > 0) {
+		/* A child that is still busy is killed, then collected. */
+		if (waitpid(h->pid, NULL, WNOHANG) == 0) {
+			kill(h->pid, SIGKILL);
+			waitpid(h->pid, NULL, 0);
+		}
+		h->pid = 0;
+	}
+	h->done = 1;
+	return 0;
+}
+
+errcode_t ext2fs_prefetch_free(struct ext2fs_prefetch_handle **h)
+{
+	ext2fs_prefetch_wait(*h);	/* wait first; *h must not be NULL */
+	return ext2fs_free_mem(h);	/* result of the free is the return value */
+}

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux