>From 40c3dac03ac40d03d987b2b1385ab3e68277067b Mon Sep 17 00:00:00 2001 From: Adrian Hunter <adrian.hunter@xxxxxxxxx> Date: Fri, 3 Jul 2009 15:25:13 +0300 Subject: [PATCH] HACK: ext3: mount fast even when recovering Speed up ext3 recovery mount time by not sync'ing the block device. Instead place all dirty buffers into the I/O queue and add a write barrier. This ensures that no subsequent write will reach the disk before all the recovery writes, but that we do not have to wait for the I/O. Note that ext3 reads sectors the correct way: through the buffer cache, so there is no risk of reading old metadata. Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx> --- fs/ext3/super.c | 66 ++++++++++++++++++++++++++++++++++++++++++---- fs/jbd/journal.c | 23 ++++++++++++---- fs/jbd/recovery.c | 19 +++++++++++++- include/linux/ext3_fs.h | 1 + include/linux/jbd.h | 1 + 5 files changed, 97 insertions(+), 13 deletions(-) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f4be66e..59efefb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1263,7 +1263,13 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, ext3_update_dynamic_rev(sb); EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, es, 1); + /* + * If we are in a hurry, we do not need to wait for the super block to + * reach the disk. We just need to make sure that all previous writes + * arrive before it. Setting the sync parameter to 2 will cause a + * write barrier to be added but will not wait for the I/O to complete. + */ + ext3_commit_super(sb, es, test_opt(sb, FAST) ? 2 : 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", @@ -1622,6 +1628,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); + /* + * Set an option to indicate that we want to mount fast even + * when recovering. 
That is achieved by not sync'ing the + * block device, but instead placing all dirty buffers into + * the I/O queue and adding a write barrier. + */ + set_opt(sbi->s_mount_opt, FAST); + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; @@ -2007,6 +2021,12 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; + /* + * Tell the journal about our fast mounting scheme, so it can + * play its part. + */ + if (test_opt(sb, FAST)) + journal->j_flags |= JFS_LOAD_FAST; spin_unlock(&journal->j_state_lock); } @@ -2224,7 +2244,13 @@ static int ext3_load_journal(struct super_block *sb, mark_sb_dirty(sb); /* Make sure we flush the recovery flag to disk. */ - ext3_commit_super(sb, es, 1); + /* + * The super gets committed later by 'ext3_setup_super()' + * or 'ext3_mark_recovery_complete()' anyway, so if we are + * in a hurry we can skip it here. + */ + if (!test_opt(sb, FAST)) + ext3_commit_super(sb, es, 1); } return 0; @@ -2285,7 +2311,16 @@ static void ext3_commit_super (struct super_block * sb, es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); - if (sync) + if (sync == 2) { + /* + * Caller has requested that a barrier is used, so that this + * write will not reach the disk before any previous ones, + * and we will not have to wait for it either. + */ + set_buffer_ordered(sbh); + ll_rw_block(SWRITE, 1, &sbh); + clear_buffer_ordered(sbh); + } else if (sync) sync_dirty_buffer(sbh); } @@ -2301,15 +2336,29 @@ static void ext3_mark_recovery_complete(struct super_block * sb, journal_t *journal = EXT3_SB(sb)->s_journal; journal_lock_updates(journal); - if (journal_flush(journal) < 0) + + /* + * There is no need to flush the journal so skip it if we are in a + * hurry. 
+ */ + if (!test_opt(sb, FAST) && journal_flush(journal) < 0) goto out; lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { + /* + * If we are in a hurry, we do not need to wait for the super + * block to reach the disk. We just need to make sure that + * all previous writes arrive before it. Setting the sync + * parameter to 2 will cause a write barrier to be added but + * will not wait for the I/O to complete. + */ + int sync = test_opt(sb, FAST) ? 2 : 1; + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); mark_sb_clean(sb); - ext3_commit_super(sb, es, 1); + ext3_commit_super(sb, es, sync); } unlock_super(sb); @@ -2348,7 +2397,12 @@ static void ext3_clear_journal_err(struct super_block * sb, EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; es->s_state |= cpu_to_le16(EXT3_ERROR_FS); - ext3_commit_super (sb, es, 1); + /* + * The super gets committed later by 'ext3_setup_super()' + * anyway, so if we are in a hurry we can skip it here. + */ + if (!test_opt(sb, FAST)) + ext3_commit_super (sb, es, 1); journal_clear_err(journal); } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9e4fa52..3fd14ef 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -822,7 +822,7 @@ static void journal_fail_superblock (journal_t *journal) * subsequent use. */ -static int journal_reset(journal_t *journal) +static int journal_reset(journal_t *journal, int wait) { journal_superblock_t *sb = journal->j_superblock; unsigned long first, last; @@ -844,7 +844,7 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; /* Add the dynamic fields and write it to disk. 
*/ - journal_update_superblock(journal, 1); + journal_update_superblock(journal, wait); return journal_start_thread(journal); } @@ -916,13 +916,14 @@ int journal_create(journal_t *journal) journal->j_flags &= ~JFS_ABORT; journal->j_format_version = 2; - return journal_reset(journal); + return journal_reset(journal, 1); } /** * void journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. * @wait: Set to '0' if you don't want to wait for IO completion. + * Note that a write barrier is used in that case. * * Update a journal's dynamic superblock fields and write it to disk, * optionally waiting for the IO to complete. @@ -961,8 +962,11 @@ void journal_update_superblock(journal_t *journal, int wait) mark_buffer_dirty(bh); if (wait) sync_dirty_buffer(bh); - else + else { + set_buffer_ordered(bh); ll_rw_block(SWRITE, 1, &bh); + clear_buffer_ordered(bh); + } out: /* If we have just flushed the log (by marking s_start==0), then @@ -1073,7 +1077,7 @@ static int load_superblock(journal_t *journal) */ int journal_load(journal_t *journal) { - int err; + int err, wait; journal_superblock_t *sb; err = load_superblock(journal); @@ -1103,7 +1107,14 @@ int journal_load(journal_t *journal) /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ - if (journal_reset(journal)) + /* + * If we are in a hurry, tell the reset not to wait, which will + * cause the journal superblock buffer to be placed into the I/O + * queue with a barrier, but we will not wait for the I/O to + * complete. + */ + wait = journal->j_flags & JFS_LOAD_FAST ? 
0 : 1; + if (journal_reset(journal, wait)) goto recovery_error; journal->j_flags &= ~JFS_ABORT; diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index db5e982..a245c36 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -261,7 +261,24 @@ int journal_recover(journal_t *journal) journal->j_transaction_sequence = ++info.end_transaction; journal_clear_revoke(journal); - err2 = sync_blockdev(journal->j_fs_dev); + /* + * We can massively speed-up the recovery mount time by avoiding + * synchronizing the block device. Instead, we just throw all the + * dirty buffers into the I/O queue, and rely on callers to add + * a write barrier. + */ + if (journal->j_flags & JFS_LOAD_FAST) { + struct block_device *bdev = journal->j_fs_dev; + + err2 = 0; + if (bdev) { + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages) + err2 = filemap_fdatawrite(mapping); + } + } else + err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d14f029..117e7a1 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -382,6 +382,7 @@ struct ext3_inode { #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ #define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write * error in ordered mode */ +#define EXT3_MOUNT_FAST 0x800000 /* Do not sync during recovery */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 346e2b8..06459ca 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -819,6 +819,7 @@ struct journal_s #define JFS_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file * data write error in ordered * mode */ +#define JFS_LOAD_FAST 0x080 /* Do not sync during recovery */ /* * Function declarations for the journaling transaction and buffer -- 1.5.6.3 -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the 
body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html