On Wed, May 14, 2008 at 01:50:43PM +0900, Hidehiro Kawai wrote: > Subject: [PATCH 4/4] jbd: fix error handling for checkpoint io > > When a checkpointing IO fails, current JBD code doesn't check the > error and continue journaling. This means latest metadata can be > lost from both the journal and filesystem. > > This patch leaves the failed metadata blocks in the journal space > and aborts journaling in the case of log_do_checkpoint(). > To achieve this, we need to do: > > 1. don't remove the failed buffer from the checkpoint list where in > the case of __try_to_free_cp_buf() because it may be released or > overwritten by a later transaction > 2. log_do_checkpoint() is the last chance, remove the failed buffer > from the checkpoint list and abort journaling > 3. when checkpointing fails, don't update the journal super block to > prevent the journalled contents from being cleaned > 4. when checkpointing fails, don't clear the needs_recovery flag, > otherwise the journalled contents are ignored and cleaned in the > recovery phase > 5. if the recovery fails, keep the needs_recovery flag > > Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@xxxxxxxxxxx> > --- > fs/ext3/ioctl.c | 12 ++++---- > fs/ext3/super.c | 13 +++++++-- > fs/jbd/checkpoint.c | 59 ++++++++++++++++++++++++++++++++++-------- > fs/jbd/journal.c | 21 +++++++++----- > fs/jbd/recovery.c | 7 +++- > include/linux/jbd.h | 7 ++++ > 6 files changed, 91 insertions(+), 28 deletions(-) > > Index: linux-2.6.26-rc2/fs/jbd/checkpoint.c > =================================================================== > --- linux-2.6.26-rc2.orig/fs/jbd/checkpoint.c > +++ linux-2.6.26-rc2/fs/jbd/checkpoint.c > @@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct j > int ret = 0; > struct buffer_head *bh = jh2bh(jh); > > - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { > + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && > + !buffer_dirty(bh) && buffer_uptodate(bh)) { > JBUFFER_TRACE(jh, "remove from checkpoint list"); > ret = __journal_remove_checkpoint(jh) + 1; > jbd_unlock_bh_state(bh); > @@ -140,6 +141,25 @@ void __log_wait_for_space(journal_t *jou > } > > /* > + * We couldn't write back some metadata buffers to the filesystem. > + * To avoid unwritten-back metadata buffers from losing, set > + * JFS_CP_ABORT flag and make sure that the journal space isn't > + * cleaned. This function also aborts journaling. > + */ > +static void __journal_abort_checkpoint(journal_t *journal, int errno) > +{ > + if (is_checkpoint_aborted(journal)) > + return; > + > + spin_lock(&journal->j_state_lock); > + journal->j_flags |= JFS_CP_ABORT; > + spin_unlock(&journal->j_state_lock); > + printk(KERN_WARNING "JBD: Checkpointing failed. Some metadata blocks " > + "are still old.\n"); > + journal_abort(journal, errno); > +} > + > +/* > * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. > * The caller must restart a list walk. Wait for someone else to run > * jbd_unlock_bh_state(). > @@ -160,21 +180,25 @@ static void jbd_sync_bh(journal_t *journ > * buffers. Note that we take the buffers in the opposite ordering > * from the one in which they were submitted for IO. > * > + * Return 0 on success, and return <0 if some buffers have failed > + * to be written out. > + * > * Called with j_list_lock held. > */ > -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) > +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) > { > struct journal_head *jh; > struct buffer_head *bh; > tid_t this_tid; > int released = 0; > + int ret = 0; > > this_tid = transaction->t_tid; > restart: > /* Did somebody clean up the transaction in the meanwhile? */ > if (journal->j_checkpoint_transactions != transaction || > transaction->t_tid != this_tid) > - return; > + return ret; > while (!released && transaction->t_checkpoint_io_list) { > jh = transaction->t_checkpoint_io_list; > bh = jh2bh(jh); > @@ -194,6 +218,9 @@ restart: > spin_lock(&journal->j_list_lock); > goto restart; > } > + if (unlikely(!buffer_uptodate(bh))) > + ret = -EIO; > + > /* > * Now in whatever state the buffer currently is, we know that > * it has been written out and so we can drop it from the list > @@ -203,6 +230,8 @@ restart: > journal_remove_journal_head(bh); > __brelse(bh); > } > + > + return ret; > } > > #define NR_BATCH 64 > @@ -226,7 +255,8 @@ __flush_batch(journal_t *journal, struct > * Try to flush one buffer from the checkpoint list to disk. > * > * Return 1 if something happened which requires us to abort the current > - * scan of the checkpoint list. > + * scan of the checkpoint list. Return <0 if the buffer has failed to > + * be written out. > * > * Called with j_list_lock held and drops it if 1 is returned > * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it > @@ -256,6 +286,9 @@ static int __process_buffer(journal_t *j > log_wait_commit(journal, tid); > ret = 1; > } else if (!buffer_dirty(bh)) { > + ret = 1; > + if (unlikely(!buffer_uptodate(bh))) > + ret = -EIO; > J_ASSERT_JH(jh, !buffer_jbddirty(bh)); > BUFFER_TRACE(bh, "remove from checkpoint"); > __journal_remove_checkpoint(jh); > @@ -263,7 +296,6 @@ static int __process_buffer(journal_t *j > jbd_unlock_bh_state(bh); > journal_remove_journal_head(bh); > __brelse(bh); > - ret = 1; > } else { > /* > * Important: we are about to write the buffer, and > @@ -318,6 +350,7 @@ int log_do_checkpoint(journal_t *journal > * OK, we need to start writing disk blocks. Take one transaction > * and write it. > */ > + result = 0; > spin_lock(&journal->j_list_lock); > if (!journal->j_checkpoint_transactions) > goto out; > @@ -334,7 +367,7 @@ restart: > int batch_count = 0; > struct buffer_head *bhs[NR_BATCH]; > struct journal_head *jh; > - int retry = 0; > + int retry = 0, err; > > while (!retry && transaction->t_checkpoint_list) { > struct buffer_head *bh; > @@ -347,6 +380,8 @@ restart: > break; > } > retry = __process_buffer(journal, jh, bhs,&batch_count); > + if (retry < 0) > + result = retry; > if (!retry && (need_resched() || > spin_needbreak(&journal->j_list_lock))) { > spin_unlock(&journal->j_list_lock); > @@ -371,14 +406,18 @@ restart: > * Now we have cleaned up the first transaction's checkpoint > * list. Let's clean up the second one > */ > - __wait_cp_io(journal, transaction); > + err = __wait_cp_io(journal, transaction); > + if (!result) > + result = err; > } > out: > spin_unlock(&journal->j_list_lock); > - result = cleanup_journal_tail(journal); > if (result < 0) > - return result; > - return 0; > + __journal_abort_checkpoint(journal, result); > + else > + result = cleanup_journal_tail(journal); > + > + return (result < 0) ? result : 0; > } > > /* > Index: linux-2.6.26-rc2/include/linux/jbd.h > =================================================================== > --- linux-2.6.26-rc2.orig/include/linux/jbd.h > +++ linux-2.6.26-rc2/include/linux/jbd.h > @@ -816,6 +816,8 @@ struct journal_s > #define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */ > #define JFS_LOADED 0x010 /* The journal superblock has been loaded */ > #define JFS_BARRIER 0x020 /* Use IDE barriers */ > +#define JFS_CP_ABORT 0x040 /* Checkpointing has failed. We don't have to > + * clean the journal space. */ > > /* > * Function declarations for the journaling transaction and buffer > @@ -1004,6 +1006,11 @@ static inline int is_journal_aborted(jou > return journal->j_flags & JFS_ABORT; > } > > +static inline int is_checkpoint_aborted(journal_t *journal) > +{ > + return journal->j_flags & JFS_CP_ABORT; > +} > + > static inline int is_handle_aborted(handle_t *handle) > { > if (handle->h_aborted) > Index: linux-2.6.26-rc2/fs/ext3/ioctl.c > =================================================================== > --- linux-2.6.26-rc2.orig/fs/ext3/ioctl.c > +++ linux-2.6.26-rc2/fs/ext3/ioctl.c > @@ -239,7 +239,7 @@ setrsvsz_out: > case EXT3_IOC_GROUP_EXTEND: { > ext3_fsblk_t n_blocks_count; > struct super_block *sb = inode->i_sb; > - int err; > + int err, err2; > > if (!capable(CAP_SYS_RESOURCE)) > return -EPERM; > @@ -254,16 +254,16 @@ setrsvsz_out: > } > err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); > journal_lock_updates(EXT3_SB(sb)->s_journal); > - journal_flush(EXT3_SB(sb)->s_journal); > + err2 = journal_flush(EXT3_SB(sb)->s_journal); > journal_unlock_updates(EXT3_SB(sb)->s_journal); > group_extend_out: > mnt_drop_write(filp->f_path.mnt); > - return err; > + return (err) ? err : err2; > } > case EXT3_IOC_GROUP_ADD: { > struct ext3_new_group_data input; > struct super_block *sb = inode->i_sb; > - int err; > + int err, err2; > > if (!capable(CAP_SYS_RESOURCE)) > return -EPERM; > @@ -280,11 +280,11 @@ group_extend_out: > > err = ext3_group_add(sb, &input); > journal_lock_updates(EXT3_SB(sb)->s_journal); > - journal_flush(EXT3_SB(sb)->s_journal); > + err2 = journal_flush(EXT3_SB(sb)->s_journal); > journal_unlock_updates(EXT3_SB(sb)->s_journal); > group_add_out: > mnt_drop_write(filp->f_path.mnt); > - return err; > + return (err) ? err : err2; > } > > > Index: linux-2.6.26-rc2/fs/ext3/super.c > =================================================================== > --- linux-2.6.26-rc2.orig/fs/ext3/super.c > +++ linux-2.6.26-rc2/fs/ext3/super.c > @@ -395,7 +395,10 @@ static void ext3_put_super (struct super > ext3_xattr_put_super(sb); > journal_destroy(sbi->s_journal); > if (!(sb->s_flags & MS_RDONLY)) { > - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > + if (!is_checkpoint_aborted(sbi->s_journal)) { > + EXT3_CLEAR_INCOMPAT_FEATURE(sb, > + EXT3_FEATURE_INCOMPAT_RECOVER); > + } > es->s_state = cpu_to_le16(sbi->s_mount_state); > BUFFER_TRACE(sbi->s_sbh, "marking dirty"); > mark_buffer_dirty(sbi->s_sbh); Is this bit here really needed? If we abort the journal the fs will be mounted read only and we should never get in here. Is there a case where we could abort the journal and not be flipped to being read-only? If there is such a case I would think that we should fix that by making the fs read-only, and not have this check. > @@ -2373,7 +2376,13 @@ static void ext3_write_super_lockfs(stru > > /* Now we set up the journal barrier. */ > journal_lock_updates(journal); > - journal_flush(journal); > + > + /* > + * We don't want to clear needs_recovery flag when we failed > + * to flush the journal. > + */ > + if (journal_flush(journal) < 0) > + return; > > /* Journal blocked and flushed, clear needs_recovery flag. */ > EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > Index: linux-2.6.26-rc2/fs/jbd/journal.c > =================================================================== > --- linux-2.6.26-rc2.orig/fs/jbd/journal.c > +++ linux-2.6.26-rc2/fs/jbd/journal.c > @@ -674,7 +674,7 @@ static journal_t * journal_init_common ( > journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); > > /* The journal is marked for error until we succeed with recovery! */ > - journal->j_flags = JFS_ABORT; > + journal->j_flags = JFS_ABORT | JFS_CP_ABORT; > > /* Set up a default-sized revoke table for the new mount. */ > err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); > @@ -1107,7 +1107,7 @@ int journal_load(journal_t *journal) > if (journal_reset(journal)) > goto recovery_error; > > - journal->j_flags &= ~JFS_ABORT; > + journal->j_flags &= ~(JFS_ABORT | JFS_CP_ABORT); > journal->j_flags |= JFS_LOADED; > return 0; > > @@ -1151,7 +1151,8 @@ void journal_destroy(journal_t *journal) > journal->j_tail = 0; > journal->j_tail_sequence = ++journal->j_transaction_sequence; > if (journal->j_sb_buffer) { > - journal_update_superblock(journal, 1); > + if (!is_checkpoint_aborted(journal)) > + journal_update_superblock(journal, 1); > brelse(journal->j_sb_buffer); > } > > @@ -1333,7 +1334,6 @@ static int journal_convert_superblock_v1 > > int journal_flush(journal_t *journal) > { > - int err = 0; > transaction_t *transaction = NULL; > unsigned long old_tail; > > @@ -1356,14 +1356,19 @@ int journal_flush(journal_t *journal) > spin_unlock(&journal->j_state_lock); > } > > - /* ...and flush everything in the log out to disk. */ > + /* ...and flush everything in the log out to disk. > + * Even if an error occurs, we don't stop this loop. > + * We do checkpoint as much as possible. */ > spin_lock(&journal->j_list_lock); > - while (!err && journal->j_checkpoint_transactions != NULL) { > + while (journal->j_checkpoint_transactions != NULL) { > spin_unlock(&journal->j_list_lock); > - err = log_do_checkpoint(journal); > + log_do_checkpoint(journal); > spin_lock(&journal->j_list_lock); > } > spin_unlock(&journal->j_list_lock); > + if (is_checkpoint_aborted(journal)) > + return -EIO; > + > cleanup_journal_tail(journal); > > /* Finally, mark the journal as really needing no recovery. > @@ -1385,7 +1390,7 @@ int journal_flush(journal_t *journal) > J_ASSERT(journal->j_head == journal->j_tail); > J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); > spin_unlock(&journal->j_state_lock); > - return err; > + return 0; > } > > /** > Index: linux-2.6.26-rc2/fs/jbd/recovery.c > =================================================================== > --- linux-2.6.26-rc2.orig/fs/jbd/recovery.c > +++ linux-2.6.26-rc2/fs/jbd/recovery.c > @@ -223,7 +223,7 @@ do { \ > */ > int journal_recover(journal_t *journal) > { > - int err; > + int err, err2; > journal_superblock_t * sb; > > struct recovery_info info; > @@ -261,7 +261,10 @@ int journal_recover(journal_t *journal) > journal->j_transaction_sequence = ++info.end_transaction; > > journal_clear_revoke(journal); > - sync_blockdev(journal->j_fs_dev); > + err2 = sync_blockdev(journal->j_fs_dev); > + if (!err) > + err = err2; > + > return err; > } > > > Other than that one question it looks good to me. Thanks much, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html