On Fri 03-05-13 22:16:13, Zheng Liu wrote: > On Mon, Apr 08, 2013 at 11:32:12PM +0200, Jan Kara wrote: > > Currently when we add a buffer to a transaction, we wait until the > > buffer is removed from BJ_Shadow list (so that we prevent any changes to > > the buffer that is just written to the journal). This can take > > unnecessarily long as a lot happens between the time the buffer is > > submitted to the journal and the time when we remove the buffer from > > BJ_Shadow list (e.g. we wait for all data buffers in the transaction, > > we issue a cache flush etc.). Also this creates a dependency of > > do_get_write_access() on transaction commit (namely waiting for data IO > > to complete) which we want to avoid when implementing transaction > > reservation. > > > > So we modify commit code to set new BH_Shadow flag when temporary > > shadowing buffer is created and we clear that flag once IO on that > > buffer is complete. This allows do_get_write_access() to wait only for > > BH_Shadow bit and thus removes the dependency on data IO completion. > > > > Signed-off-by: Jan Kara <jack@xxxxxxx> > > A minor nit below. Otherwise the patch looks good to me. > Reviewed-by: Zheng Liu <wenqing.lz@xxxxxxxxxx> Thanks for review! I'll fix that typo. Honza > > > --- > > fs/jbd2/commit.c | 20 ++++++++++---------- > > fs/jbd2/journal.c | 2 ++ > > fs/jbd2/transaction.c | 44 +++++++++++++++++++------------------------- > > include/linux/jbd.h | 25 +++++++++++++++++++++++++ > > include/linux/jbd2.h | 28 ++++++++++++++++++++++++++++ > > include/linux/jbd_common.h | 26 -------------------------- > > 6 files changed, 84 insertions(+), 61 deletions(-) > > > > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c > > index 1a03762..4863f5b 100644 > > --- a/fs/jbd2/commit.c > > +++ b/fs/jbd2/commit.c > > @@ -30,15 +30,22 @@ > > #include <trace/events/jbd2.h> > > > > /* > > - * Default IO end handler for temporary BJ_IO buffer_heads. > > + * IO end handler for temporary buffer_heads handling writes to the journal. > > */ > > static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) > > { > > + struct buffer_head *orig_bh = bh->b_private; > > + > > BUFFER_TRACE(bh, ""); > > if (uptodate) > > set_buffer_uptodate(bh); > > else > > clear_buffer_uptodate(bh); > > + if (orig_bh) { > > + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); > > + smp_mb__after_clear_bit(); > > + wake_up_bit(&orig_bh->b_state, BH_Shadow); > > + } > > unlock_buffer(bh); > > } > > > > @@ -818,7 +825,7 @@ start_journal_io: > > jbd2_unfile_log_bh(bh); > > > > /* > > - * The list contains temporary buffer heads created by > > + * The list contains temporary buffer heas created by > ^^^^ > typo: head > > Regards, > - Zheng > > > * jbd2_journal_write_metadata_buffer(). > > */ > > BUFFER_TRACE(bh, "dumping temporary bh"); > > @@ -831,6 +838,7 @@ start_journal_io: > > bh = jh2bh(jh); > > clear_buffer_jwrite(bh); > > J_ASSERT_BH(bh, buffer_jbddirty(bh)); > > + J_ASSERT_BH(bh, !buffer_shadow(bh)); > > > > /* The metadata is now released for reuse, but we need > > to remember it against this transaction so that when > > @@ -838,14 +846,6 @@ start_journal_io: > > required. */ > > JBUFFER_TRACE(jh, "file as BJ_Forget"); > > jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); > > - /* > > - * Wake up any transactions which were waiting for this IO to > > - * complete. The barrier must be here so that changes by > > - * jbd2_journal_file_buffer() take effect before wake_up_bit() > > - * does the waitqueue check. > > - */ > > - smp_mb(); > > - wake_up_bit(&bh->b_state, BH_Unshadow); > > JBUFFER_TRACE(jh, "brelse shadowed buffer"); > > __brelse(bh); > > } > > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c > > index e03aae0..e9a9cdb 100644 > > --- a/fs/jbd2/journal.c > > +++ b/fs/jbd2/journal.c > > @@ -453,6 +453,7 @@ repeat: > > new_bh->b_size = bh_in->b_size; > > new_bh->b_bdev = journal->j_dev; > > new_bh->b_blocknr = blocknr; > > + new_bh->b_private = bh_in; > > set_buffer_mapped(new_bh); > > set_buffer_dirty(new_bh); > > > > @@ -467,6 +468,7 @@ repeat: > > spin_lock(&journal->j_list_lock); > > __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); > > spin_unlock(&journal->j_list_lock); > > + set_buffer_shadow(bh_in); > > jbd_unlock_bh_state(bh_in); > > > > return do_escape | (done_copy_out << 1); > > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c > > index bc35899..81df09c 100644 > > --- a/fs/jbd2/transaction.c > > +++ b/fs/jbd2/transaction.c > > @@ -620,6 +620,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) > > bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); > > } > > > > +static int sleep_on_shadow_bh(void *word) > > +{ > > + io_schedule(); > > + return 0; > > +} > > + > > /* > > * If the buffer is already part of the current transaction, then there > > * is nothing we need to do. If it is already part of a prior > > @@ -747,41 +753,29 @@ repeat: > > * journaled. If the primary copy is already going to > > * disk then we cannot do copy-out here. */ > > > > - if (jh->b_jlist == BJ_Shadow) { > > - DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); > > - wait_queue_head_t *wqh; > > - > > - wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); > > - > > + if (buffer_shadow(bh)) { > > JBUFFER_TRACE(jh, "on shadow: sleep"); > > jbd_unlock_bh_state(bh); > > - /* commit wakes up all shadow buffers after IO */ > > - for ( ; ; ) { > > - prepare_to_wait(wqh, &wait.wait, > > - TASK_UNINTERRUPTIBLE); > > - if (jh->b_jlist != BJ_Shadow) > > - break; > > - schedule(); > > - } > > - finish_wait(wqh, &wait.wait); > > + wait_on_bit(&bh->b_state, BH_Shadow, > > + sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); > > goto repeat; > > } > > > > - /* Only do the copy if the currently-owning transaction > > - * still needs it. If it is on the Forget list, the > > - * committing transaction is past that stage. The > > - * buffer had better remain locked during the kmalloc, > > - * but that should be true --- we hold the journal lock > > - * still and the buffer is already on the BUF_JOURNAL > > - * list so won't be flushed. > > + /* > > + * Only do the copy if the currently-owning transaction still > > + * needs it. If buffer isn't on BJ_Metadata list, the > > + * committing transaction is past that stage (here we use the > > + * fact that BH_Shadow is set under bh_state lock together with > > + * refiling to BJ_Shadow list and at this point we know the > > + * buffer doesn't have BH_Shadow set). > > * > > * Subtle point, though: if this is a get_undo_access, > > * then we will be relying on the frozen_data to contain > > * the new value of the committed_data record after the > > * transaction, so we HAVE to force the frozen_data copy > > - * in that case. */ > > - > > - if (jh->b_jlist != BJ_Forget || force_copy) { > > + * in that case. > > + */ > > + if (jh->b_jlist == BJ_Metadata || force_copy) { > > JBUFFER_TRACE(jh, "generate frozen data"); > > if (!frozen_buffer) { > > JBUFFER_TRACE(jh, "allocate memory for buffer"); > > diff --git a/include/linux/jbd.h b/include/linux/jbd.h > > index c8f3297..1c36b0c 100644 > > --- a/include/linux/jbd.h > > +++ b/include/linux/jbd.h > > @@ -244,6 +244,31 @@ typedef struct journal_superblock_s > > > > #include <linux/fs.h> > > #include <linux/sched.h> > > + > > +enum jbd_state_bits { > > + BH_JBD /* Has an attached ext3 journal_head */ > > + = BH_PrivateStart, > > + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ > > + BH_Freed, /* Has been freed (truncated) */ > > + BH_Revoked, /* Has been revoked from the log */ > > + BH_RevokeValid, /* Revoked flag is valid */ > > + BH_JBDDirty, /* Is dirty but journaled */ > > + BH_State, /* Pins most journal_head state */ > > + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ > > + BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ > > + BH_JBDPrivateStart, /* First bit available for private use by FS */ > > +}; > > + > > +BUFFER_FNS(JBD, jbd) > > +BUFFER_FNS(JWrite, jwrite) > > +BUFFER_FNS(JBDDirty, jbddirty) > > +TAS_BUFFER_FNS(JBDDirty, jbddirty) > > +BUFFER_FNS(Revoked, revoked) > > +TAS_BUFFER_FNS(Revoked, revoked) > > +BUFFER_FNS(RevokeValid, revokevalid) > > +TAS_BUFFER_FNS(RevokeValid, revokevalid) > > +BUFFER_FNS(Freed, freed) > > + > > #include <linux/jbd_common.h> > > > > #define J_ASSERT(assert) BUG_ON(!(assert)) > > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h > > index 4584518..be5115f 100644 > > --- a/include/linux/jbd2.h > > +++ b/include/linux/jbd2.h > > @@ -302,6 +302,34 @@ typedef struct journal_superblock_s > > > > #include <linux/fs.h> > > #include <linux/sched.h> > > + > > +enum jbd_state_bits { > > + BH_JBD /* Has an attached ext3 journal_head */ > > + = BH_PrivateStart, > > + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ > > + BH_Freed, /* Has been freed (truncated) */ > > + BH_Revoked, /* Has been revoked from the log */ > > + BH_RevokeValid, /* Revoked flag is valid */ > > + BH_JBDDirty, /* Is dirty but journaled */ > > + BH_State, /* Pins most journal_head state */ > > + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ > > + BH_Shadow, /* IO on shadow buffer is running */ > > + BH_Verified, /* Metadata block has been verified ok */ > > + BH_JBDPrivateStart, /* First bit available for private use by FS */ > > +}; > > + > > +BUFFER_FNS(JBD, jbd) > > +BUFFER_FNS(JWrite, jwrite) > > +BUFFER_FNS(JBDDirty, jbddirty) > > +TAS_BUFFER_FNS(JBDDirty, jbddirty) > > +BUFFER_FNS(Revoked, revoked) > > +TAS_BUFFER_FNS(Revoked, revoked) > > +BUFFER_FNS(RevokeValid, revokevalid) > > +TAS_BUFFER_FNS(RevokeValid, revokevalid) > > +BUFFER_FNS(Freed, freed) > > +BUFFER_FNS(Shadow, shadow) > > +BUFFER_FNS(Verified, verified) > > + > > #include <linux/jbd_common.h> > > > > #define J_ASSERT(assert) BUG_ON(!(assert)) > > diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h > > index 6133679..b1f7089 100644 > > --- a/include/linux/jbd_common.h > > +++ b/include/linux/jbd_common.h > > @@ -1,32 +1,6 @@ > > #ifndef _LINUX_JBD_STATE_H > > #define _LINUX_JBD_STATE_H > > > > -enum jbd_state_bits { > > - BH_JBD /* Has an attached ext3 journal_head */ > > - = BH_PrivateStart, > > - BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ > > - BH_Freed, /* Has been freed (truncated) */ > > - BH_Revoked, /* Has been revoked from the log */ > > - BH_RevokeValid, /* Revoked flag is valid */ > > - BH_JBDDirty, /* Is dirty but journaled */ > > - BH_State, /* Pins most journal_head state */ > > - BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ > > - BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ > > - BH_Verified, /* Metadata block has been verified ok */ > > - BH_JBDPrivateStart, /* First bit available for private use by FS */ > > -}; > > - > > -BUFFER_FNS(JBD, jbd) > > -BUFFER_FNS(JWrite, jwrite) > > -BUFFER_FNS(JBDDirty, jbddirty) > > -TAS_BUFFER_FNS(JBDDirty, jbddirty) > > -BUFFER_FNS(Revoked, revoked) > > -TAS_BUFFER_FNS(Revoked, revoked) > > -BUFFER_FNS(RevokeValid, revokevalid) > > -TAS_BUFFER_FNS(RevokeValid, revokevalid) > > -BUFFER_FNS(Freed, freed) > > -BUFFER_FNS(Verified, verified) > > - > > static inline struct buffer_head *jh2bh(struct journal_head *jh) > > { > > return jh->b_bh; > > -- > > 1.7.1 > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Jan Kara <jack@xxxxxxx> SUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html