On Wed, Dec 12, 2012 at 09:50:16PM +0100, Jan Kara wrote: > We cannot wait for transaction commit in journal_unmap_buffer() because we hold > page lock which ranks below transaction start. We solve the issue by bailing > out of journal_unmap_buffer() and jbd2_journal_invalidatepage() with -EBUSY. ocfs2 also calls jbd2_journal_invalidatepage(). I would think we would need to apply a similar fix up to ocfs2, lest they end up having jbd2_journal_invalidatepage() silently failing for them. I'm going to hold off on this patch until we're sure it's not going to cause problems for ocfs2. A quick check indicates that they also support sub-page block sizes, which would tend to indicate that they could get hit by the same dead lock, yes? - Ted > Caller is then responsible for waiting for transaction commit to finish and try > invalidation again. Since the issue can happen only for page stradding i_size, > it is simple enough to manually call jbd2_journal_invalidatepage() for such > page from ext4_setattr(), check the return value and wait if necessary. > > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/ext4/inode.c | 82 +++++++++++++++++++++++++++++++++++++------ > fs/jbd2/transaction.c | 27 +++++++------- > include/linux/jbd2.h | 2 +- > include/trace/events/ext4.h | 4 +- > 4 files changed, 88 insertions(+), 27 deletions(-) > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 66abac7..cc0abeb 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -2813,8 +2813,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) > block_invalidatepage(page, offset); > } > > -static void ext4_journalled_invalidatepage(struct page *page, > - unsigned long offset) > +static int __ext4_journalled_invalidatepage(struct page *page, > + unsigned long offset) > { > journal_t *journal = EXT4_JOURNAL(page->mapping->host); > > @@ -2826,7 +2826,14 @@ static void ext4_journalled_invalidatepage(struct page *page, > if (offset == 0) > ClearPageChecked(page); > > - jbd2_journal_invalidatepage(journal, page, offset); > + return jbd2_journal_invalidatepage(journal, page, offset); > +} > + > +/* Wrapper for aops... */ > +static void ext4_journalled_invalidatepage(struct page *page, > + unsigned long offset) > +{ > + WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); > } > > static int ext4_releasepage(struct page *page, gfp_t wait) > @@ -4230,6 +4237,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) > } > > /* > + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate > + * buffers that are attached to a page stradding i_size and are undergoing > + * commit. In that case we have to wait for commit to finish and try again. > + */ > +static void ext4_wait_for_tail_page_commit(struct inode *inode) > +{ > + struct page *page; > + unsigned offset; > + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; > + tid_t commit_tid = 0; > + int ret; > + > + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); > + /* > + * All buffers in the last page remain valid? Then there's nothing to > + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == > + * blocksize case > + */ > + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) > + return; > + while (1) { > + page = find_lock_page(inode->i_mapping, > + inode->i_size >> PAGE_CACHE_SHIFT); > + if (!page) > + return; > + ret = __ext4_journalled_invalidatepage(page, offset); > + unlock_page(page); > + page_cache_release(page); > + if (ret != -EBUSY) > + return; > + commit_tid = 0; > + read_lock(&journal->j_state_lock); > + if (journal->j_committing_transaction) > + commit_tid = journal->j_committing_transaction->t_tid; > + read_unlock(&journal->j_state_lock); > + if (commit_tid) > + jbd2_log_wait_commit(journal, commit_tid); > + } > +} > + > +/* > * ext4_setattr() > * > * Called from notify_change. > @@ -4342,16 +4390,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) > } > > if (attr->ia_valid & ATTR_SIZE) { > - if (attr->ia_size != i_size_read(inode)) { > - truncate_setsize(inode, attr->ia_size); > - /* Inode size will be reduced, wait for dio in flight. > - * Temporarily disable dioread_nolock to prevent > - * livelock. */ > + if (attr->ia_size != inode->i_size) { > + loff_t oldsize = inode->i_size; > + > + i_size_write(inode, attr->ia_size); > + /* > + * Blocks are going to be removed from the inode. Wait > + * for dio in flight. Temporarily disable > + * dioread_nolock to prevent livelock. > + */ > if (orphan) { > - ext4_inode_block_unlocked_dio(inode); > - inode_dio_wait(inode); > - ext4_inode_resume_unlocked_dio(inode); > + if (!ext4_should_journal_data(inode)) { > + ext4_inode_block_unlocked_dio(inode); > + inode_dio_wait(inode); > + ext4_inode_resume_unlocked_dio(inode); > + } else > + ext4_wait_for_tail_page_commit(inode); > } > + /* > + * Truncate pagecache after we've waited for commit > + * in data=journal mode to make pages freeable. > + */ > + truncate_pagecache(inode, oldsize, inode->i_size); > } > ext4_truncate(inode); > } > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c > index a74ba46..e1475b4 100644 > --- a/fs/jbd2/transaction.c > +++ b/fs/jbd2/transaction.c > @@ -1850,7 +1850,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, > > BUFFER_TRACE(bh, "entry"); > > -retry: > /* > * It is safe to proceed here without the j_list_lock because the > * buffers cannot be stolen by try_to_free_buffers as long as we are > @@ -1945,14 +1944,11 @@ retry: > * for commit and try again. > */ > if (partial_page) { > - tid_t tid = journal->j_committing_transaction->t_tid; > - > jbd2_journal_put_journal_head(jh); > spin_unlock(&journal->j_list_lock); > jbd_unlock_bh_state(bh); > write_unlock(&journal->j_state_lock); > - jbd2_log_wait_commit(journal, tid); > - goto retry; > + return -EBUSY; > } > /* > * OK, buffer won't be reachable after truncate. We just set > @@ -2013,21 +2009,23 @@ zap_buffer_unlocked: > * @page: page to flush > * @offset: length of page to invalidate. > * > - * Reap page buffers containing data after offset in page. > - * > + * Reap page buffers containing data after offset in page. Can return -EBUSY > + * if buffers are part of the committing transaction and the page is straddling > + * i_size. Caller then has to wait for current commit and try again. > */ > -void jbd2_journal_invalidatepage(journal_t *journal, > - struct page *page, > - unsigned long offset) > +int jbd2_journal_invalidatepage(journal_t *journal, > + struct page *page, > + unsigned long offset) > { > struct buffer_head *head, *bh, *next; > unsigned int curr_off = 0; > int may_free = 1; > + int ret = 0; > > if (!PageLocked(page)) > BUG(); > if (!page_has_buffers(page)) > - return; > + return 0; > > /* We will potentially be playing with lists other than just the > * data lists (especially for journaled data mode), so be > @@ -2041,9 +2039,11 @@ void jbd2_journal_invalidatepage(journal_t *journal, > if (offset <= curr_off) { > /* This block is wholly outside the truncation point */ > lock_buffer(bh); > - may_free &= journal_unmap_buffer(journal, bh, > - offset > 0); > + ret = journal_unmap_buffer(journal, bh, offset > 0); > unlock_buffer(bh); > + if (ret < 0) > + return ret; > + may_free &= ret; > } > curr_off = next_off; > bh = next; > @@ -2054,6 +2054,7 @@ void jbd2_journal_invalidatepage(journal_t *journal, > if (may_free && try_to_free_buffers(page)) > J_ASSERT(!page_has_buffers(page)); > } > + return 0; > } > > /* > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h > index 3efc43f..cb7d6af 100644 > --- a/include/linux/jbd2.h > +++ b/include/linux/jbd2.h > @@ -1099,7 +1099,7 @@ extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); > extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); > extern int jbd2_journal_forget (handle_t *, struct buffer_head *); > extern void journal_sync_buffer (struct buffer_head *); > -extern void jbd2_journal_invalidatepage(journal_t *, > +extern int jbd2_journal_invalidatepage(journal_t *, > struct page *, unsigned long); > extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); > extern int jbd2_journal_stop(handle_t *); > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h > index df972d9..3ef522e 100644 > --- a/include/trace/events/ext4.h > +++ b/include/trace/events/ext4.h > @@ -476,13 +476,13 @@ DECLARE_EVENT_CLASS(ext4_invalidatepage_op, > (unsigned long) __entry->index, __entry->offset) > ); > > -DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage > +DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, > TP_PROTO(struct page *page, unsigned long offset), > > TP_ARGS(page, offset) > ); > > -DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage > +DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, > TP_PROTO(struct page *page, unsigned long offset), > > TP_ARGS(page, offset) > -- > 1.7.1 > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html