On Tue 11-08-15 19:04:19, Oleg Nesterov wrote: > We can remove everything from struct sb_writers except frozen > and add the array of percpu_rw_semaphore's instead. > > This patch doesn't remove sb_writers->wait_unfrozen yet, we keep > it for get_super_thawed(). We will probably remove it later. > > This change tries to address the following problems: > > - Firstly, __sb_start_write() looks simply buggy. It does > __sb_end_write() if it sees ->frozen, but if it migrates > to another CPU before percpu_counter_dec(), sb_wait_write() > can wrongly succeed if there is another task which holds > the same "semaphore": sb_wait_write() can miss the result > of the previous percpu_counter_inc() but see the result > of this percpu_counter_dec(). > > - As Dave Hansen reports, it is suboptimal. The trivial > microbenchmark that writes to a tmpfs file in a loop runs > 12% faster if we change this code to rely on RCU and kill > the memory barriers. > > - This code doesn't look simple. It would be better to rely > on the generic locking code. > > According to Dave, this change adds the same performance > improvement. > > Note: with this change both freeze_super() and thaw_super() will do > synchronize_sched_expedited() 3 times. This is just ugly. But: > > - This will be "fixed" by the rcu_sync changes we are going > to merge. After that freeze_super()->percpu_down_write() > will use synchronize_sched(), and thaw_super() won't use > synchronize() at all. > > This doesn't need any changes in fs/super.c. > > - Once we merge rcu_sync changes, we can also change super.c > so that all wb_write->rw_sem's will share the single ->rss > in struct sb_writes, then freeze_super() will need only one > synchronize_sched(). > > Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx> The patch looks good. You can add: Reviewed-by: Jan Kara <jack@xxxxxxxx> Honza > --- > fs/super.c | 107 ++++++++++++++-------------------------------------- > include/linux/fs.h | 19 +++------ > 2 files changed, 35 insertions(+), 91 deletions(-) > > diff --git a/fs/super.c b/fs/super.c > index 75436e2..8762997 100644 > --- a/fs/super.c > +++ b/fs/super.c > @@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work) > int i; > > for (i = 0; i < SB_FREEZE_LEVELS; i++) > - percpu_counter_destroy(&s->s_writers.counter[i]); > + percpu_free_rwsem(&s->s_writers.rw_sem[i]); > kfree(s); > } > > @@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) > goto fail; > > for (i = 0; i < SB_FREEZE_LEVELS; i++) { > - if (percpu_counter_init(&s->s_writers.counter[i], 0, > - GFP_KERNEL) < 0) > + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], > + sb_writers_name[i], > + &type->s_writers_key[i])) > goto fail; > - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], > - &type->s_writers_key[i], 0); > } > - init_waitqueue_head(&s->s_writers.wait); > init_waitqueue_head(&s->s_writers.wait_unfrozen); > s->s_bdi = &noop_backing_dev_info; > s->s_flags = flags; > @@ -1161,47 +1159,10 @@ out: > */ > void __sb_end_write(struct super_block *sb, int level) > { > - percpu_counter_dec(&sb->s_writers.counter[level-1]); > - /* > - * Make sure s_writers are updated before we wake up waiters in > - * freeze_super(). > - */ > - smp_mb(); > - if (waitqueue_active(&sb->s_writers.wait)) > - wake_up(&sb->s_writers.wait); > - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); > + percpu_up_read(sb->s_writers.rw_sem + level-1); > } > EXPORT_SYMBOL(__sb_end_write); > > -static int do_sb_start_write(struct super_block *sb, int level, bool wait, > - unsigned long ip) > -{ > - if (wait) > - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip); > -retry: > - if (unlikely(sb->s_writers.frozen >= level)) { > - if (!wait) > - return 0; > - wait_event(sb->s_writers.wait_unfrozen, > - sb->s_writers.frozen < level); > - } > - > - percpu_counter_inc(&sb->s_writers.counter[level-1]); > - /* > - * Make sure counter is updated before we check for frozen. > - * freeze_super() first sets frozen and then checks the counter. > - */ > - smp_mb(); > - if (unlikely(sb->s_writers.frozen >= level)) { > - __sb_end_write(sb, level); > - goto retry; > - } > - > - if (!wait) > - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip); > - return 1; > -} > - > /* > * This is an internal function, please use sb_start_{write,pagefault,intwrite} > * instead. > @@ -1209,7 +1170,7 @@ retry: > int __sb_start_write(struct super_block *sb, int level, bool wait) > { > bool force_trylock = false; > - int ret; > + int ret = 1; > > #ifdef CONFIG_LOCKDEP > /* > @@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) > int i; > > for (i = 0; i < level - 1; i++) > - if (lock_is_held(&sb->s_writers.lock_map[i])) { > + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { > force_trylock = true; > break; > } > } > #endif > - ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); > + if (wait && !force_trylock) > + percpu_down_read(sb->s_writers.rw_sem + level-1); > + else > + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); > + > WARN_ON(force_trylock & !ret); > return ret; > } > @@ -1249,9 +1214,7 @@ EXPORT_SYMBOL(__sb_start_write); > */ > static void sb_wait_write(struct super_block *sb, int level) > { > - s64 writers; > - > - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); > + percpu_down_write(sb->s_writers.rw_sem + level-1); > /* > * We are going to return to userspace and forget about this lock, the > * ownership goes to the caller of thaw_super() which does unlock. > @@ -1262,24 +1225,18 @@ static void sb_wait_write(struct super_block *sb, int level) > * this leads to lockdep false-positives, so currently we do the early > * release right after acquire. > */ > - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); > - > - do { > - DEFINE_WAIT(wait); > + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); > +} > > - /* > - * We use a barrier in prepare_to_wait() to separate setting > - * of frozen and checking of the counter > - */ > - prepare_to_wait(&sb->s_writers.wait, &wait, > - TASK_UNINTERRUPTIBLE); > +static void sb_freeze_unlock(struct super_block *sb) > +{ > + int level; > > - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); > - if (writers) > - schedule(); > + for (level = 0; level < SB_FREEZE_LEVELS; ++level) > + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); > > - finish_wait(&sb->s_writers.wait, &wait); > - } while (writers); > + for (level = SB_FREEZE_LEVELS; --level >= 0; ) > + percpu_up_write(sb->s_writers.rw_sem + level); > } > > /** > @@ -1338,20 +1295,14 @@ int freeze_super(struct super_block *sb) > return 0; > } > > - /* From now on, no new normal writers can start */ > sb->s_writers.frozen = SB_FREEZE_WRITE; > - smp_wmb(); > - > /* Release s_umount to preserve sb_start_write -> s_umount ordering */ > up_write(&sb->s_umount); > - > sb_wait_write(sb, SB_FREEZE_WRITE); > + down_write(&sb->s_umount); > > /* Now we go and block page faults... */ > - down_write(&sb->s_umount); > sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; > - smp_wmb(); > - > sb_wait_write(sb, SB_FREEZE_PAGEFAULT); > > /* All writers are done so after syncing there won't be dirty data */ > @@ -1359,7 +1310,6 @@ int freeze_super(struct super_block *sb) > > /* Now wait for internal filesystem counter */ > sb->s_writers.frozen = SB_FREEZE_FS; > - smp_wmb(); > sb_wait_write(sb, SB_FREEZE_FS); > > if (sb->s_op->freeze_fs) { > @@ -1368,7 +1318,7 @@ int freeze_super(struct super_block *sb) > printk(KERN_ERR > "VFS:Filesystem freeze failed\n"); > sb->s_writers.frozen = SB_UNFROZEN; > - smp_wmb(); > + sb_freeze_unlock(sb); > wake_up(&sb->s_writers.wait_unfrozen); > deactivate_locked_super(sb); > return ret; > @@ -1400,8 +1350,10 @@ int thaw_super(struct super_block *sb) > return -EINVAL; > } > > - if (sb->s_flags & MS_RDONLY) > + if (sb->s_flags & MS_RDONLY) { > + sb->s_writers.frozen = SB_UNFROZEN; > goto out; > + } > > if (sb->s_op->unfreeze_fs) { > error = sb->s_op->unfreeze_fs(sb); > @@ -1413,12 +1365,11 @@ int thaw_super(struct super_block *sb) > } > } > > -out: > sb->s_writers.frozen = SB_UNFROZEN; > - smp_wmb(); > + sb_freeze_unlock(sb); > +out: > wake_up(&sb->s_writers.wait_unfrozen); > deactivate_locked_super(sb); > - > return 0; > } > EXPORT_SYMBOL(thaw_super); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 6addccc..fb2cb4a 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1,7 +1,6 @@ > #ifndef _LINUX_FS_H > #define _LINUX_FS_H > > - > #include <linux/linkage.h> > #include <linux/wait.h> > #include <linux/kdev_t.h> > @@ -31,6 +30,7 @@ > #include <linux/percpu-rwsem.h> > #include <linux/blk_types.h> > #include <linux/workqueue.h> > +#include <linux/percpu-rwsem.h> > > #include <asm/byteorder.h> > #include <uapi/linux/fs.h> > @@ -1247,16 +1247,9 @@ enum { > #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) > > struct sb_writers { > - /* Counters for counting writers at each level */ > - struct percpu_counter counter[SB_FREEZE_LEVELS]; > - wait_queue_head_t wait; /* queue for waiting for > - writers / faults to finish */ > - int frozen; /* Is sb frozen? */ > - wait_queue_head_t wait_unfrozen; /* queue for waiting for > - sb to be thawed */ > -#ifdef CONFIG_DEBUG_LOCK_ALLOC > - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; > -#endif > + int frozen; /* Is sb frozen? */ > + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ > + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; > }; > > struct super_block { > @@ -1364,9 +1357,9 @@ void __sb_end_write(struct super_block *sb, int level); > int __sb_start_write(struct super_block *sb, int level, bool wait); > > #define __sb_acquire_write(sb, lev) \ > - rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) > + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) > #define __sb_release_write(sb, lev) \ > - rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) > + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) > > /** > * sb_end_write - drop write access to a superblock > -- > 1.5.5.1 > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html