On Mon 14-03-16 11:34:59, Daeho Jeong wrote: > Now, in ext4, there is a race condition between changing inode journal > mode and ext4_writepages(). While ext4_writepages() is executed on > a non-journalled mode inode, the inode's journal mode could be enabled > by ioctl() and then, some pages dirtied after switching the journal > mode will still be exposed to ext4_writepages() in non-journalled mode. > To resolve this problem, we use an fs-wide per-cpu rw semaphore at > Jan Kara's suggestion because we don't want to waste ext4_inode_info's > space for this extremely rare case. > > Signed-off-by: Daeho Jeong <daeho.jeong@xxxxxxxxxxx> > Signed-off-by: Jan Kara <jack@xxxxxxx> Yeah, this patch is fine now. Thanks! Honza > --- > fs/ext4/ext4.h | 4 ++++ > fs/ext4/inode.c | 15 ++++++++++++--- > fs/ext4/super.c | 4 ++++ > kernel/locking/percpu-rwsem.c | 1 + > 4 files changed, 21 insertions(+), 3 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 157b458..c757a3d 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -33,6 +33,7 @@ > #include <linux/ratelimit.h> > #include <crypto/hash.h> > #include <linux/falloc.h> > +#include <linux/percpu-rwsem.h> > #ifdef __KERNEL__ > #include <linux/compat.h> > #endif > @@ -1475,6 +1476,9 @@ struct ext4_sb_info { > struct ratelimit_state s_err_ratelimit_state; > struct ratelimit_state s_warning_ratelimit_state; > struct ratelimit_state s_msg_ratelimit_state; > + > + /* Barrier between changing inodes' journal flags and writepages ops.
*/ > + struct percpu_rw_semaphore s_journal_flag_rwsem; > }; > > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 71fab4c..4f45f24 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -2476,11 +2476,14 @@ static int ext4_writepages(struct address_space *mapping, > struct blk_plug plug; > bool give_up_on_write = false; > > + percpu_down_read(&sbi->s_journal_flag_rwsem); > trace_ext4_writepages(inode, wbc); > > - if (dax_mapping(mapping)) > - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, > - wbc); > + if (dax_mapping(mapping)) { > + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, > + wbc); > + goto out_writepages; > + } > > /* > * No pages to write? This is mainly a kludge to avoid starting > @@ -2650,6 +2653,7 @@ retry: > out_writepages: > trace_ext4_writepages_result(inode, wbc, ret, > nr_to_write - wbc->nr_to_write); > + percpu_up_read(&sbi->s_journal_flag_rwsem); > return ret; > } > > @@ -5366,6 +5370,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > journal_t *journal; > handle_t *handle; > int err; > + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > > /* > * We have to be very careful here: changing a data block's > @@ -5405,6 +5410,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > } > } > > + percpu_down_write(&sbi->s_journal_flag_rwsem); > jbd2_journal_lock_updates(journal); > > /* > @@ -5421,6 +5427,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > err = jbd2_journal_flush(journal); > if (err < 0) { > jbd2_journal_unlock_updates(journal); > + percpu_up_write(&sbi->s_journal_flag_rwsem); > ext4_inode_resume_unlocked_dio(inode); > return err; > } > @@ -5429,6 +5436,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) > ext4_set_aops(inode); > > jbd2_journal_unlock_updates(journal); > + percpu_up_write(&sbi->s_journal_flag_rwsem); > + > if (val) > 
up_write(&EXT4_I(inode)->i_mmap_sem); > ext4_inode_resume_unlocked_dio(inode); > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 3ed01ec..a12950d 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -861,6 +861,7 @@ static void ext4_put_super(struct super_block *sb) > percpu_counter_destroy(&sbi->s_freeinodes_counter); > percpu_counter_destroy(&sbi->s_dirs_counter); > percpu_counter_destroy(&sbi->s_dirtyclusters_counter); > + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); > brelse(sbi->s_sbh); > #ifdef CONFIG_QUOTA > for (i = 0; i < EXT4_MAXQUOTAS; i++) > @@ -3926,6 +3927,9 @@ no_journal: > if (!err) > err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, > GFP_KERNEL); > + if (!err) > + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); > + > if (err) { > ext4_msg(sb, KERN_ERR, "insufficient memory"); > goto failed_mount6; > diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c > index f231e0b..bec0b64 100644 > --- a/kernel/locking/percpu-rwsem.c > +++ b/kernel/locking/percpu-rwsem.c > @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) > free_percpu(brw->fast_read_ctr); > brw->fast_read_ctr = NULL; /* catch use after free bugs */ > } > +EXPORT_SYMBOL_GPL(percpu_free_rwsem); > > /* > * This is the fast-path for down_read/up_read. If it succeeds we rely > -- > 1.7.9.5 > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html