On Mon 15-06-15 14:27:09, Joseph Qi wrote: > If updating journal superblock fails after journal data has been flushed, > the error is omitted and this will mislead the caller as a normal case. > In ocfs2, the checkpoint will be treated successfully and the other node > can get the lock to update. Since the sb_start is still pointing to the > old log block, it will rewrite the journal data during journal recovery > by the other node. Thus the new updates will be overwritten and ocfs2 > corrupts. > So in above case we have to return the error, and ocfs2_commit_cache will > take care of the error and prevent the other node to do update first. > And only after recovering journal it can do the new updates. > > The issue discussion mail can be found at: > https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html > http://comments.gmane.org/gmane.comp.file-systems.ext4/48841 > > Reported-by: Yiwen Jiang <jiangyiwen@xxxxxxxxxx> > Signed-off-by: Joseph Qi <joseph.qi@xxxxxxxxxx> > Tested-by: Yiwen Jiang <jiangyiwen@xxxxxxxxxx> > Cc: Junxiao Bi <junxiao.bi@xxxxxxxxxx> > Cc: <stable@xxxxxxxxxxxxxxx> The patch looks good but it seems to be against relatively old kernel version. Can you please rebase your patch against current kernel? Thanks! Honza > --- > fs/jbd2/checkpoint.c | 5 ++--- > fs/jbd2/journal.c | 37 ++++++++++++++++++++++++++++++------- > include/linux/jbd2.h | 4 ++-- > 3 files changed, 34 insertions(+), 12 deletions(-) > > diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c > index 988b32e..82e5b7d 100644 > --- a/fs/jbd2/checkpoint.c > +++ b/fs/jbd2/checkpoint.c > @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) > unsigned long blocknr; > > if (is_journal_aborted(journal)) > - return 1; > + return -EIO; > > if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) > return 1; > @@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) > if (journal->j_flags & JBD2_BARRIER) > blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); > > - __jbd2_update_log_tail(journal, first_tid, blocknr); > - return 0; > + return __jbd2_update_log_tail(journal, first_tid, blocknr); > } > > > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c > index b96bd80..6b33a42 100644 > --- a/fs/jbd2/journal.c > +++ b/fs/jbd2/journal.c > @@ -885,9 +885,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, > * > * Requires j_checkpoint_mutex > */ > -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > { > unsigned long freed; > + int ret; > > BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); > > @@ -897,7 +898,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > * space and if we lose sb update during power failure we'd replay > * old transaction with possibly newly overwritten data. > */ > - jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); > + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); > + if (ret) > + goto out; > + > write_lock(&journal->j_state_lock); > freed = block - journal->j_tail; > if (block < journal->j_tail) > @@ -913,6 +917,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > journal->j_tail_sequence = tid; > journal->j_tail = block; > write_unlock(&journal->j_state_lock); > + > +out: > + return ret; > } > > /* > @@ -1331,7 +1338,7 @@ static int journal_reset(journal_t *journal) > return jbd2_journal_start_thread(journal); > } > > -static void jbd2_write_superblock(journal_t *journal, int write_op) > +static int jbd2_write_superblock(journal_t *journal, int write_op) > { > struct buffer_head *bh = journal->j_sb_buffer; > journal_superblock_t *sb = journal->j_superblock; > @@ -1370,7 +1377,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) > printk(KERN_ERR "JBD2: Error %d detected when updating " > "journal superblock for %s.\n", ret, > journal->j_devname); > + jbd2_journal_abort(journal, ret); > } > + > + return ret; > } > > /** > @@ -1383,10 +1393,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) > * Update a journal's superblock information about log tail and write it to > * disk, waiting for the IO to complete. > */ > -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, > +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, > unsigned long tail_block, int write_op) > { > journal_superblock_t *sb = journal->j_superblock; > + int ret; > > BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); > jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", > @@ -1395,13 +1406,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, > sb->s_sequence = cpu_to_be32(tail_tid); > sb->s_start = cpu_to_be32(tail_block); > > - jbd2_write_superblock(journal, write_op); > + ret = jbd2_write_superblock(journal, write_op); > + if (ret) > + goto out; > > /* Log is no longer empty */ > write_lock(&journal->j_state_lock); > WARN_ON(!sb->s_sequence); > journal->j_flags &= ~JBD2_FLUSHED; > write_unlock(&journal->j_state_lock); > + > +out: > + return ret; > } > > /** > @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal) > return -EIO; > > mutex_lock(&journal->j_checkpoint_mutex); > - jbd2_cleanup_journal_tail(journal); > + if (!err) { > + err = jbd2_cleanup_journal_tail(journal); > + if (err < 0) { > + mutex_unlock(&journal->j_checkpoint_mutex); > + goto out; > + } > + } > > /* Finally, mark the journal as really needing no recovery. > * This sets s_start==0 in the underlying superblock, which is > @@ -1966,7 +1988,8 @@ int jbd2_journal_flush(journal_t *journal) > J_ASSERT(journal->j_head == journal->j_tail); > J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); > write_unlock(&journal->j_state_lock); > - return 0; > +out: > + return err; > } > > /** > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h > index 20e7f78..edb640a 100644 > --- a/include/linux/jbd2.h > +++ b/include/linux/jbd2.h > @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); > int jbd2_journal_next_log_block(journal_t *, unsigned long long *); > int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, > unsigned long *block); > -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); > +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); > void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); > > /* Commit management */ > @@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal); > extern int jbd2_journal_wipe (journal_t *, int); > extern int jbd2_journal_skip_recovery (journal_t *); > extern void jbd2_journal_update_sb_errno(journal_t *); > -extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, > +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, > unsigned long, int); > extern void __jbd2_journal_abort_hard (journal_t *); > extern void jbd2_journal_abort (journal_t *, int); > -- > 1.8.4.3 > > -- Jan Kara <jack@xxxxxxx> SUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html