On Fri, Feb 10, 2017 at 05:08:54PM +1100, Neil Brown wrote: > On Tue, Feb 07 2017, Shaohua Li wrote: > > > Currently MD is reusing some bio fields. To remove the hack, we attach > > extra data to each bio. Each personality can attach extra data to the > > bios, so we don't need to reuse bio fields. > > I must say that I don't really like this approach. > Temporarily modifying ->bi_private and ->bi_end_io seems > .... intrusive. I suspect it works, but I wonder if it is really > robust in the long term. > > How about a different approach.. Your main concern with my first patch > was that it called md_write_start() and md_write_end() much more often, > and these performed atomic ops on "global" variables, particularly > writes_pending. > > We could change writes_pending to a per-cpu array which we only count > occasionally when needed. As writes_pending is updated often and > checked rarely, a per-cpu array which is summed on demand seems > appropriate. > > The following patch is an early draft - it doesn't obviously fail and > isn't obviously wrong to me. There is certainly room for improvement > and maybe bugs. > Next week I'll work on collecting the re-factoring into separate > patches, which are possibly good-to-have anyway. For your first patch, I don't have much concern. It's ok to me. What I don't like is the bi_phys_segments handling part. The patches add a lot of logic to handle the reference count. They should work, but I'd say it's not easy to understand and could be error prone. What we really need is a reference count for the bio, so let's just add a reference count. That's my logic and it's simple. For the modifying bi_private and bi_end_io part, I saw some filesystems are doing it this way, at least btrfs. If this is really intrusive, is cloning a bio better? 
Thanks, Shaohua > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 8926fb781cdc..883c409902b0 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -64,6 +64,8 @@ > #include <linux/raid/md_p.h> > #include <linux/raid/md_u.h> > #include <linux/slab.h> > +#include <linux/percpu.h> > + > #include <trace/events/block.h> > #include "md.h" > #include "bitmap.h" > @@ -2250,6 +2252,81 @@ static void export_array(struct mddev *mddev) > mddev->major_version = 0; > } > > +/* > + * The percpu writes_pending counters are linked with ->checkers and ->lock. > + * If ->writes_pending can always be decremented without a lock. > + * It can only be incremented without a lock if ->checkers is 0 and the test+incr > + * happen in a rcu_readlocked region. > + * ->checkers can only be changed under ->lock protection. > + * To determine if ->writes_pending is totally zero, a quick sum without > + * locks can be performed. If this is non-zero, then the result is final. > + * Otherwise ->checkers must be incremented and synchronize_rcu() called. > + * Then a sum calculated under ->lock, and the result is final until the > + * ->checkers is decremented, or the lock is dropped. > + * > + */ > + > +static bool __writes_pending(struct mddev *mddev) > +{ > + long cnt = 0; > + int i; > + for_each_possible_cpu(i) > + cnt += *per_cpu_ptr(mddev->writes_pending_percpu, i); > + return cnt != 0; > +} > + > +static bool set_in_sync(struct mddev *mddev) > +{ > + > + WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); > + if (__writes_pending(mddev)) { > + if (mddev->safemode == 1) > + mddev->safemode = 0; > + return false; > + } > + if (mddev->in_sync) > + return false; > + > + mddev->checkers ++; > + spin_unlock(&mddev->lock); > + synchronize_rcu(); > + spin_lock(&mddev->lock); > + if (mddev->in_sync) { > + /* someone else set it */ > + mddev->checkers --; > + return false; > + } > + > + if (! 
__writes_pending(mddev)) > + mddev->in_sync = 1; > + if (mddev->safemode == 1) > + mddev->safemode = 0; > + mddev->checkers --; > + return mddev->in_sync; > +} > + > +static void inc_writes_pending(struct mddev *mddev) > +{ > + rcu_read_lock(); > + if (mddev->checkers == 0) { > + __this_cpu_inc(*mddev->writes_pending_percpu); > + rcu_read_unlock(); > + return; > + } > + rcu_read_unlock(); > + /* Need that spinlock */ > + spin_lock(&mddev->lock); > + this_cpu_inc(*mddev->writes_pending_percpu); > + spin_unlock(&mddev->lock); > +} > + > +static void zero_writes_pending(struct mddev *mddev) > +{ > + int i; > + for_each_possible_cpu(i) > + *per_cpu_ptr(mddev->writes_pending_percpu, i) = 0; > +} > + > static void sync_sbs(struct mddev *mddev, int nospares) > { > /* Update each superblock (in-memory image), but > @@ -3583,7 +3660,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) > mddev->delta_disks = 0; > mddev->reshape_backwards = 0; > mddev->degraded = 0; > - spin_unlock(&mddev->lock); > > if (oldpers->sync_request == NULL && > mddev->external) { > @@ -3596,8 +3672,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) > */ > mddev->in_sync = 0; > mddev->safemode_delay = 0; > - mddev->safemode = 0; > } > + spin_unlock(&mddev->lock); > > oldpers->free(mddev, oldpriv); > > @@ -3963,16 +4039,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) > wake_up(&mddev->sb_wait); > err = 0; > } else /* st == clean */ { > + err = 0; > restart_array(mddev); > - if (atomic_read(&mddev->writes_pending) == 0) { > - if (mddev->in_sync == 0) { > - mddev->in_sync = 1; > - if (mddev->safemode == 1) > - mddev->safemode = 0; > - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); > - } > - err = 0; > - } else > + if (set_in_sync(mddev)) > + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); > + else if (!mddev->in_sync) > err = -EBUSY; > } > if (!err) > @@ -4030,15 +4101,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) > if 
(err) > break; > spin_lock(&mddev->lock); > - if (atomic_read(&mddev->writes_pending) == 0) { > - if (mddev->in_sync == 0) { > - mddev->in_sync = 1; > - if (mddev->safemode == 1) > - mddev->safemode = 0; > - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); > - } > - err = 0; > - } else > + if (set_in_sync(mddev)) > + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); > + else if (!mddev->in_sync) > err = -EBUSY; > spin_unlock(&mddev->lock); > } else > @@ -4993,6 +5058,9 @@ static void md_free(struct kobject *ko) > del_gendisk(mddev->gendisk); > put_disk(mddev->gendisk); > } > + if (mddev->writes_pending_percpu) { > + free_percpu(mddev->writes_pending_percpu); > + } > > kfree(mddev); > } > @@ -5069,6 +5137,13 @@ static int md_alloc(dev_t dev, char *name) > blk_queue_make_request(mddev->queue, md_make_request); > blk_set_stacking_limits(&mddev->queue->limits); > > + mddev->writes_pending_percpu = alloc_percpu(long); > + if (!mddev->writes_pending_percpu) { > + blk_cleanup_queue(mddev->queue); > + mddev->queue = NULL; > + goto abort; > + } > + > disk = alloc_disk(1 << shift); > if (!disk) { > blk_cleanup_queue(mddev->queue); > @@ -5152,7 +5227,7 @@ static void md_safemode_timeout(unsigned long data) > { > struct mddev *mddev = (struct mddev *) data; > > - if (!atomic_read(&mddev->writes_pending)) { > + if (!__writes_pending(mddev)) { > mddev->safemode = 1; > if (mddev->external) > sysfs_notify_dirent_safe(mddev->sysfs_state); > @@ -5358,7 +5433,7 @@ int md_run(struct mddev *mddev) > } else if (mddev->ro == 2) /* auto-readonly not meaningful */ > mddev->ro = 0; > > - atomic_set(&mddev->writes_pending,0); > + zero_writes_pending(mddev); > atomic_set(&mddev->max_corr_read_errors, > MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); > mddev->safemode = 0; > @@ -5451,7 +5526,6 @@ static int restart_array(struct mddev *mddev) > return -EINVAL; > } > > - mddev->safemode = 0; > mddev->ro = 0; > set_disk_ro(disk, 0); > pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); > @@ 
-7767,9 +7841,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) > md_wakeup_thread(mddev->sync_thread); > did_change = 1; > } > - atomic_inc(&mddev->writes_pending); > - if (mddev->safemode == 1) > - mddev->safemode = 0; > + inc_writes_pending(mddev); > if (mddev->in_sync) { > spin_lock(&mddev->lock); > if (mddev->in_sync) { > @@ -7790,12 +7862,17 @@ EXPORT_SYMBOL(md_write_start); > > void md_write_end(struct mddev *mddev) > { > - if (atomic_dec_and_test(&mddev->writes_pending)) { > - if (mddev->safemode == 2) > + this_cpu_dec(*mddev->writes_pending_percpu); > + if (mddev->safemode == 2) { > + if (!__writes_pending(mddev)) > md_wakeup_thread(mddev->thread); > - else if (mddev->safemode_delay) > - mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); > - } > + } else if (mddev->safemode_delay) > + /* The roundup() ensure this only performs locking once > + * every ->safemode_delay jiffies > + */ > + mod_timer(&mddev->safemode_timer, > + roundup(jiffies, mddev->safemode_delay) + mddev->safemode_delay); > + > } > EXPORT_SYMBOL(md_write_end); > > @@ -8398,7 +8475,7 @@ void md_check_recovery(struct mddev *mddev) > test_bit(MD_RECOVERY_DONE, &mddev->recovery) || > test_bit(MD_RELOAD_SB, &mddev->flags) || > (mddev->external == 0 && mddev->safemode == 1) || > - (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) > + (mddev->safemode == 2 && ! 
__writes_pending(mddev) > && !mddev->in_sync && mddev->recovery_cp == MaxSector) > )) > return; > @@ -8450,19 +8527,14 @@ void md_check_recovery(struct mddev *mddev) > md_reload_sb(mddev, mddev->good_device_nr); > } > > - if (!mddev->external) { > + if (!mddev->external && mddev->safemode) { > int did_change = 0; > spin_lock(&mddev->lock); > - if (mddev->safemode && > - !atomic_read(&mddev->writes_pending) && > - !mddev->in_sync && > - mddev->recovery_cp == MaxSector) { > - mddev->in_sync = 1; > + if (mddev->recovery_cp == MaxSector && > + set_in_sync(mddev)) { > did_change = 1; > set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); > } > - if (mddev->safemode == 1) > - mddev->safemode = 0; > spin_unlock(&mddev->lock); > if (did_change) > sysfs_notify_dirent_safe(mddev->sysfs_state); > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 2a514036a83d..7e41f882d33d 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -404,7 +404,8 @@ struct mddev { > */ > unsigned int safemode_delay; > struct timer_list safemode_timer; > - atomic_t writes_pending; > + long *writes_pending_percpu; > + int checkers; /* # of threads checking writes_pending */ > struct request_queue *queue; /* for plugging ... */ > > struct bitmap *bitmap; /* the bitmap for the device */ > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index 3c7e106c12a2..45d260326efd 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -7360,7 +7360,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) > * we can't wait pending write here, as this is called in > * raid5d, wait will deadlock. > */ > - if (atomic_read(&mddev->writes_pending)) > + if (!mddev->in_sync) > return -EBUSY; > log = conf->log; > conf->log = NULL; -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html