Most of the patch is based on Coly's md/raid1 commit fd76863e37fe
("RAID1: a new I/O barrier implementation to remove resync window")
and several other patches related to the barrier implementation in
raid1, such as:

07169fd478ed raid1: Replace raise_barrier/lower_barrier with
             freeze_array/unfreeze_array when reconfiguring the array
43ac9b84a399 md/raid1: Use a new variable to count flighting sync requests
f6eca2d43ed6 raid1: prevent freeze_array/wait_all_barriers deadlock

I have done some basic tests with fio. The test script is:

[global]
bsrange=4-16k
iodepth=2
size=1G
numjobs=1
[read]
ioengine=sync
rw=read
fsync=1
[write]
ioengine=psync
rw=write
fsync=1
[mixed]
ioengine=sync
rw=rw
fsync=1
[randmixed]
ioengine=libaio
rw=randrw
fsync=1

Normal I/O and disk plug/unplug showed no problems while fio was
running in my tests, but there are other scenarios that still need to
be tested. I am sending this version for your comments first, before I
finish all the tests I have in mind.

Signed-off-by: Lidong Zhong <lzhong@xxxxxxxx>
---
 drivers/md/raid1-10.c |   7 ++
 drivers/md/raid1.h    |   5 -
 drivers/md/raid10.c   | 311 ++++++++++++++++++++++++++++++++++++--------------
 drivers/md/raid10.h   |  36 +++++-
 4 files changed, 262 insertions(+), 97 deletions(-)

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 400001b815db..6d2ee38b0a15 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -80,3 +80,10 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
         size -= len;
     } while (idx++ < RESYNC_PAGES && size > 0);
 }
+
+/* sector mapped to bucket */
+static inline int sector_to_idx(sector_t sector)
+{
+    return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
+             BARRIER_BUCKETS_NR_BITS);
+}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index eb84bc68e2fd..f8f5d2886dc1 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -210,9 +210,4 @@ enum r1bio_state {
     R1BIO_FailFast,
 };
 
-static inline int sector_to_idx(sector_t sector)
-{
-    return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
-             BARRIER_BUCKETS_NR_BITS);
-}
 #endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c5e6c60fc0d4..19b5d91f3f2e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,8 +97,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r10conf *conf);
-static void lower_barrier(struct r10conf *conf);
+static void allow_barrier(struct r10conf *conf, sector_t sector_nr);
+static void lower_barrier(struct r10conf *conf, sector_t sector_nr);
 static int _enough(struct r10conf *conf, int previous, int ignore);
 static int enough(struct r10conf *conf, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
@@ -297,10 +297,11 @@ static void free_r10bio(struct r10bio *r10_bio)
 static void put_buf(struct r10bio *r10_bio)
 {
     struct r10conf *conf = r10_bio->mddev->private;
+    sector_t sect = r10_bio->sector;
 
     mempool_free(r10_bio, conf->r10buf_pool);
 
-    lower_barrier(conf);
+    lower_barrier(conf, sect);
 }
 
 static void reschedule_retry(struct r10bio *r10_bio)
@@ -308,10 +309,12 @@ static void reschedule_retry(struct r10bio *r10_bio)
     unsigned long flags;
     struct mddev *mddev = r10_bio->mddev;
     struct r10conf *conf = mddev->private;
+    int idx;
 
+    idx = sector_to_idx(r10_bio->sector);
     spin_lock_irqsave(&conf->device_lock, flags);
     list_add(&r10_bio->retry_list, &conf->retry_list);
-    conf->nr_queued ++;
+    atomic_inc(&conf->nr_queued[idx]);
     spin_unlock_irqrestore(&conf->device_lock, flags);
 
     /* wake up frozen array... */
@@ -329,6 +332,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 {
     struct bio *bio = r10_bio->master_bio;
     struct r10conf *conf = r10_bio->mddev->private;
+    sector_t sect = bio->bi_iter.bi_sector;
 
     if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
         bio->bi_status = BLK_STS_IOERR;
@@ -338,7 +342,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
      * Wake up any possible resync thread that waits for the device
      * to go idle.
      */
-    allow_barrier(conf);
+    allow_barrier(conf, sect);
 
     free_r10bio(r10_bio);
 }
@@ -959,71 +963,177 @@ static void flush_pending_writes(struct r10conf *conf)
  * there is no normal IO happeing. It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-
-static void raise_barrier(struct r10conf *conf, int force)
+static void raise_barrier(struct r10conf *conf, int force, sector_t sector_nr)
 {
-    BUG_ON(force && !conf->barrier);
+    int idx = sector_to_idx(sector_nr);
+
     spin_lock_irq(&conf->resync_lock);
 
-    /* Wait until no block IO is waiting (unless 'force') */
-    wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
+    /* Wait until no block IO is waiting */
+    wait_event_lock_irq(conf->wait_barrier,
+                force || !atomic_read(&conf->nr_waiting[idx]),
                 conf->resync_lock);
 
     /* block any new IO from starting */
-    conf->barrier++;
-
-    /* Now wait for all pending IO to complete */
+    atomic_inc(&conf->barrier[idx]);
+    /*
+     * In raise_barrier() we firstly increase conf->barrier[idx] then
+     * check conf->nr_pending[idx]. In _wait_barrier() we firstly
+     * increase conf->nr_pending[idx] then check conf->barrier[idx].
+     * A memory barrier here to make sure conf->nr_pending[idx] won't
+     * be fetched before conf->barrier[idx] is increased. Otherwise
+     * there will be a race between raise_barrier() and _wait_barrier().
+     */
+    smp_mb__after_atomic();
+
+    /* For these conditions we must wait:
+     * A: while the array is in frozen state
+     * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
+     *    existing in corresponding I/O barrier bucket.
+     * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
+     *    max resync count which allowed on current I/O barrier bucket.
+     */
     wait_event_lock_irq(conf->wait_barrier,
-                !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
+                !conf->array_frozen &&
+                !atomic_read(&conf->nr_pending[idx]) &&
+                atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
                 conf->resync_lock);
 
+    atomic_inc(&conf->nr_sync_pending);
     spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(struct r10conf *conf)
+static void lower_barrier(struct r10conf *conf, sector_t sector_nr)
 {
-    unsigned long flags;
-    spin_lock_irqsave(&conf->resync_lock, flags);
-    conf->barrier--;
-    spin_unlock_irqrestore(&conf->resync_lock, flags);
+    int idx = sector_to_idx(sector_nr);
+
+    BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
+
+    atomic_dec(&conf->barrier[idx]);
+    atomic_dec(&conf->nr_sync_pending);
     wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r10conf *conf)
+static void __wait_barrier(struct r10conf *conf, int idx)
 {
+    /*
+     * We need to increase conf->nr_pending[idx] very early here,
+     * then raise_barrier() can be blocked when it waits for
+     * conf->nr_pending[idx] to be 0. Then we can avoid holding
+     * conf->resync_lock when there is no barrier raised in same
+     * barrier unit bucket. Also if the array is frozen, I/O
+     * should be blocked until array is unfrozen.
+     */
+    atomic_inc(&conf->nr_pending[idx]);
+    /*
+     * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
+     * check conf->barrier[idx]. In raise_barrier() we firstly increase
+     * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
+     * barrier is necessary here to make sure conf->barrier[idx] won't be
+     * fetched before conf->nr_pending[idx] is increased. Otherwise there
+     * will be a race between _wait_barrier() and raise_barrier().
+     */
+    smp_mb__after_atomic();
+
+    /*
+     * Don't worry about checking two atomic_t variables at same time
+     * here. If during we check conf->barrier[idx], the array is
+     * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
+     * 0, it is safe to return and make the I/O continue. Because the
+     * array is frozen, all I/O returned here will eventually complete
+     * or be queued, no race will happen. See code comment in
+     * freeze_array().
+     */
+    if (!READ_ONCE(conf->array_frozen) &&
+        !atomic_read(&conf->barrier[idx]))
+        return;
+
+    /*
+     * After holding conf->resync_lock, conf->nr_pending[idx]
+     * should be decreased before waiting for barrier to drop.
+     * Otherwise, we may encounter a race condition because
+     * raise_barrer() might be waiting for conf->nr_pending[idx]
+     * to be 0 at same time.
+     */
     spin_lock_irq(&conf->resync_lock);
-    if (conf->barrier) {
-        conf->nr_waiting++;
-        /* Wait for the barrier to drop.
-         * However if there are already pending
-         * requests (preventing the barrier from
-         * rising completely), and the
-         * pre-process bio queue isn't empty,
-         * then don't wait, as we need to empty
-         * that queue to get the nr_pending
-         * count down.
-         */
-        raid10_log(conf->mddev, "wait barrier");
-        wait_event_lock_irq(conf->wait_barrier,
-                    !conf->barrier ||
-                    (atomic_read(&conf->nr_pending) &&
-                     current->bio_list &&
-                     (!bio_list_empty(&current->bio_list[0]) ||
-                      !bio_list_empty(&current->bio_list[1]))),
-                    conf->resync_lock);
-        conf->nr_waiting--;
-        if (!conf->nr_waiting)
-            wake_up(&conf->wait_barrier);
-    }
-    atomic_inc(&conf->nr_pending);
+    atomic_inc(&conf->nr_waiting[idx]);
+    atomic_dec(&conf->nr_pending[idx]);
+    /*
+     * In case freeze_array() is waiting for
+     * get_unqueued_pending() == extra
+     */
+    wake_up(&conf->wait_barrier);
+    /* Wait for the barrier in same barrier unit bucket to drop. */
+    wait_event_lock_irq(conf->wait_barrier,
+                !conf->array_frozen &&
+                !atomic_read(&conf->barrier[idx]),
+                conf->resync_lock);
+    atomic_inc(&conf->nr_pending[idx]);
+    atomic_dec(&conf->nr_waiting[idx]);
     spin_unlock_irq(&conf->resync_lock);
 }
 
-static void allow_barrier(struct r10conf *conf)
+static void wait_barrier(struct r10conf *conf, sector_t sector_nr)
 {
-    if ((atomic_dec_and_test(&conf->nr_pending)) ||
-            (conf->array_freeze_pending))
-        wake_up(&conf->wait_barrier);
+    int idx = sector_to_idx(sector_nr);
+    __wait_barrier(conf, idx);
+}
+
+static void wait_read_barrier(struct r10conf *conf, sector_t sector_nr)
+{
+    int idx = sector_to_idx(sector_nr);
+
+    /*
+     * Very similar to _wait_barrier(). The difference is, for read
+     * I/O we don't need wait for sync I/O, but if the whole array
+     * is frozen, the read I/O still has to wait until the array is
+     * unfrozen. Since there is no ordering requirement with
+     * conf->barrier[idx] here, memory barrier is unnecessary as well.
+     */
+    atomic_inc(&conf->nr_pending[idx]);
+
+    if (!READ_ONCE(conf->array_frozen))
+        return;
+
+    spin_lock_irq(&conf->resync_lock);
+    atomic_inc(&conf->nr_waiting[idx]);
+    atomic_dec(&conf->nr_pending[idx]);
+    /*
+     * In case freeze_array() is waiting for
+     * get_unqueued_pending() == extra
+     */
+    wake_up(&conf->wait_barrier);
+    /* Wait for array to be unfrozen */
+    wait_event_lock_irq(conf->wait_barrier,
+                !conf->array_frozen,
+                conf->resync_lock);
+    atomic_inc(&conf->nr_pending[idx]);
+    atomic_dec(&conf->nr_waiting[idx]);
+    spin_unlock_irq(&conf->resync_lock);
+}
+
+static void __allow_barrier(struct r10conf *conf, int idx)
+{
+    atomic_dec(&conf->nr_pending[idx]);
+    wake_up(&conf->wait_barrier);
+}
+
+static void allow_barrier(struct r10conf *conf, sector_t sector_nr)
+{
+    int idx = sector_to_idx(sector_nr);
+    __allow_barrier(conf, idx);
+}
+
+static int get_unqueued_pending(struct r10conf *conf)
+{
+    int ret, idx;
+
+    ret = atomic_read(&conf->nr_sync_pending);
+    for(idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
+        ret += atomic_read(&conf->nr_pending[idx]) -
+            atomic_read(&conf->nr_queued[idx]);
+
+    return ret;
 }
 
 static void freeze_array(struct r10conf *conf, int extra)
@@ -1041,15 +1151,12 @@ static void freeze_array(struct r10conf *conf, int extra)
      * we continue.
      */
     spin_lock_irq(&conf->resync_lock);
-    conf->array_freeze_pending++;
-    conf->barrier++;
-    conf->nr_waiting++;
+    conf->array_frozen = 1;
     wait_event_lock_irq_cmd(conf->wait_barrier,
-                atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+                get_unqueued_pending(conf) == extra,
                 conf->resync_lock,
                 flush_pending_writes(conf));
-    conf->array_freeze_pending--;
     spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -1057,8 +1164,7 @@ static void unfreeze_array(struct r10conf *conf)
 {
     /* reverse the effect of the freeze */
     spin_lock_irq(&conf->resync_lock);
-    conf->barrier--;
-    conf->nr_waiting--;
+    conf->array_frozen = 0;
     wake_up(&conf->wait_barrier);
     spin_unlock_irq(&conf->resync_lock);
 }
@@ -1168,7 +1274,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
      * thread has put up a bar for new requests.
      * Continue immediately if no resync is active currently.
      */
-    wait_barrier(conf);
+    wait_read_barrier(conf, bio->bi_iter.bi_sector);
 
     sectors = r10_bio->sectors;
     while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1179,12 +1285,12 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
          * pass
          */
         raid10_log(conf->mddev, "wait reshape");
-        allow_barrier(conf);
+        allow_barrier(conf, bio->bi_iter.bi_sector);
         wait_event(conf->wait_barrier,
                conf->reshape_progress <= bio->bi_iter.bi_sector ||
                conf->reshape_progress >= bio->bi_iter.bi_sector +
                sectors);
-        wait_barrier(conf);
+        wait_read_barrier(conf, bio->bi_iter.bi_sector);
     }
 
     rdev = read_balance(conf, r10_bio, &max_sectors);
@@ -1312,6 +1418,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
     struct md_rdev *blocked_rdev;
     sector_t sectors;
     int max_sectors;
+    sector_t sect = bio->bi_iter.bi_sector;
 
     if ((mddev_is_clustered(mddev) &&
          md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1334,7 +1441,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
      * thread has put up a bar for new requests.
      * Continue immediately if no resync is active currently.
      */
-    wait_barrier(conf);
+    wait_barrier(conf, sect);
 
     sectors = r10_bio->sectors;
     while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1345,12 +1452,12 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
          * pass
          */
         raid10_log(conf->mddev, "wait reshape");
-        allow_barrier(conf);
+        allow_barrier(conf, sect);
         wait_event(conf->wait_barrier,
                conf->reshape_progress <= bio->bi_iter.bi_sector ||
                conf->reshape_progress >= bio->bi_iter.bi_sector +
                sectors);
-        wait_barrier(conf);
+        wait_barrier(conf, sect);
     }
 
     if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
@@ -1497,10 +1604,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
                 rdev_dec_pending(rdev, mddev);
             }
         }
-        allow_barrier(conf);
+        allow_barrier(conf, sect);
         raid10_log(conf->mddev, "wait rdev %d blocked",
                 blocked_rdev->raid_disk);
         md_wait_for_blocked_rdev(blocked_rdev, mddev);
-        wait_barrier(conf);
+        wait_barrier(conf, sect);
         goto retry_write;
     }
 
@@ -1729,8 +1836,11 @@ static void print_conf(struct r10conf *conf)
 
 static void close_sync(struct r10conf *conf)
 {
-    wait_barrier(conf);
-    allow_barrier(conf);
+    int idx;
+    for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
+        __wait_barrier(conf, idx);
+        __allow_barrier(conf, idx);
+    }
 
     mempool_destroy(conf->r10buf_pool);
     conf->r10buf_pool = NULL;
@@ -2634,7 +2744,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
         md_error(mddev, rdev);
 
     rdev_dec_pending(rdev, mddev);
-    allow_barrier(conf);
+    allow_barrier(conf, bio->bi_iter.bi_sector);
     r10_bio->state = 0;
     raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
 }
@@ -2647,7 +2757,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
      * Or possibly if failed and we need to record
      * a bad block.
      */
-    int m;
+    int m, idx;
     struct md_rdev *rdev;
 
     if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
@@ -2721,9 +2831,10 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
             }
         }
         if (fail) {
+            idx = sector_to_idx(r10_bio->sector);
             spin_lock_irq(&conf->device_lock);
             list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
-            conf->nr_queued++;
+            atomic_inc(&conf->nr_queued[idx]);
             spin_unlock_irq(&conf->device_lock);
             /*
              * In case freeze_array() is waiting for condition
@@ -2748,6 +2859,7 @@ static void raid10d(struct md_thread *thread)
     struct r10conf *conf = mddev->private;
     struct list_head *head = &conf->retry_list;
     struct blk_plug plug;
+    int idx;
 
     md_check_recovery(mddev);
 
@@ -2755,17 +2867,15 @@ static void raid10d(struct md_thread *thread)
         !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
         LIST_HEAD(tmp);
         spin_lock_irqsave(&conf->device_lock, flags);
-        if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-            while (!list_empty(&conf->bio_end_io_list)) {
-                list_move(conf->bio_end_io_list.prev, &tmp);
-                conf->nr_queued--;
-            }
-        }
+        if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+            list_splice_init(&conf->bio_end_io_list, &tmp);
         spin_unlock_irqrestore(&conf->device_lock, flags);
         while (!list_empty(&tmp)) {
             r10_bio = list_first_entry(&tmp, struct r10bio,
                            retry_list);
             list_del(&r10_bio->retry_list);
+            idx = sector_to_idx(r10_bio->sector);
+            atomic_dec(&conf->nr_queued[idx]);
             if (mddev->degraded)
                 set_bit(R10BIO_Degraded, &r10_bio->state);
@@ -2788,7 +2898,8 @@ static void raid10d(struct md_thread *thread)
         }
         r10_bio = list_entry(head->prev, struct r10bio, retry_list);
         list_del(head->prev);
-        conf->nr_queued--;
+        idx = sector_to_idx(r10_bio->sector);
+        atomic_dec(&conf->nr_queued[idx]);
         spin_unlock_irqrestore(&conf->device_lock, flags);
 
         mddev = r10_bio->mddev;
@@ -2937,7 +3048,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
     struct r10bio *r10_bio;
     struct bio *biolist = NULL, *bio;
     sector_t max_sector, nr_sectors;
-    int i;
+    int i, idx;
     int max_sync;
     sector_t sync_blocks;
     sector_t sectors_skipped = 0;
@@ -3048,7 +3159,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
      * If there is non-resync activity waiting for a turn, then let it
      * though before starting on this new sync request.
      */
-    if (conf->nr_waiting)
+    idx = sector_to_idx(sector_nr);
+    if (atomic_read(&conf->nr_waiting[idx]))
         schedule_timeout_uninterruptible(1);
 
     /* Again, very different code for resync and recovery.
@@ -3132,7 +3244,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
             r10_bio = raid10_alloc_init_r10buf(conf);
             r10_bio->state = 0;
-            raise_barrier(conf, rb2 != NULL);
+            raise_barrier(conf, rb2 != NULL, sect);
             atomic_set(&r10_bio->remaining, 0);
 
             r10_bio->master_bio = (struct bio*)rb2;
@@ -3354,7 +3466,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
         r10_bio->mddev = mddev;
         atomic_set(&r10_bio->remaining, 0);
-        raise_barrier(conf, 0);
+        raise_barrier(conf, 0, sector_nr);
         conf->next_resync = sector_nr;
 
         r10_bio->master_bio = NULL;
@@ -3693,6 +3805,27 @@ static struct r10conf *setup_conf(struct mddev *mddev)
     if (!conf->mirrors)
         goto out;
 
+    /*barriers*/
+    conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
+                   sizeof(atomic_t), GFP_KERNEL);
+    if(!conf->nr_pending)
+        goto out;
+
+    conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
+                   sizeof(atomic_t), GFP_KERNEL);
+    if(!conf->nr_waiting)
+        goto out;
+
+    conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
+                  sizeof(atomic_t), GFP_KERNEL);
+    if(!conf->nr_queued)
+        goto out;
+
+    conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
+                sizeof(atomic_t), GFP_KERNEL);
+    if(!conf->barrier)
+        goto out;
+
     conf->tmppage = alloc_page(GFP_KERNEL);
     if (!conf->tmppage)
         goto out;
@@ -3731,7 +3864,6 @@ static struct r10conf *setup_conf(struct mddev *mddev)
     spin_lock_init(&conf->resync_lock);
     init_waitqueue_head(&conf->wait_barrier);
-    atomic_set(&conf->nr_pending, 0);
 
     conf->thread = md_register_thread(raid10d, mddev, "raid10");
     if (!conf->thread)
         goto out;
@@ -3745,6 +3877,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
     mempool_destroy(conf->r10bio_pool);
     kfree(conf->mirrors);
     safe_put_page(conf->tmppage);
+    kfree(conf->nr_pending);
+    kfree(conf->nr_waiting);
+    kfree(conf->nr_queued);
+    kfree(conf->barrier);
     if (conf->bio_split)
         bioset_free(conf->bio_split);
     kfree(conf);
@@ -3981,9 +4117,9 @@ static void raid10_quiesce(struct mddev *mddev, int quiesce)
     struct r10conf *conf = mddev->private;
 
     if (quiesce)
-        raise_barrier(conf, 0);
+        freeze_array(conf, 0);
     else
-        lower_barrier(conf);
+        unfreeze_array(conf);
 }
 
 static int raid10_resize(struct mddev *mddev, sector_t sectors)
@@ -4061,7 +4197,8 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
             rdev->new_raid_disk = rdev->raid_disk * 2;
             rdev->sectors = size;
         }
-        conf->barrier = 1;
+        /* Array must appear to be quiesced */
+        conf->array_frozen = 1;
     }
 
     return conf;
@@ -4505,7 +4642,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
     if (need_flush ||
         time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
         /* Need to update reshape_position in metadata */
-        wait_barrier(conf);
+        wait_barrier(conf, sector_nr);
         mddev->reshape_position = conf->reshape_progress;
         if (mddev->reshape_backwards)
             mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
@@ -4518,18 +4655,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
         wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
                test_bit(MD_RECOVERY_INTR, &mddev->recovery));
         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-            allow_barrier(conf);
+            allow_barrier(conf, sector_nr);
             return sectors_done;
         }
         conf->reshape_safe = mddev->reshape_position;
-        allow_barrier(conf);
+        allow_barrier(conf, sector_nr);
     }
 
 read_more:
     /* Now schedule reads for blocks from sector_nr to last */
     r10_bio = raid10_alloc_init_r10buf(conf);
     r10_bio->state = 0;
-    raise_barrier(conf, sectors_done != 0);
+    raise_barrier(conf, sectors_done != 0, sector_nr);
     atomic_set(&r10_bio->remaining, 0);
     r10_bio->mddev = mddev;
     r10_bio->sector = sector_nr;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index e2e8840de9bf..cac22e0bd14f 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -2,6 +2,31 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
+/*
+ * This part is copied from the implementation in raid1
+ * each barrier unit size is 64MB fow now
+ * note: it must be larger than RESYNC_DEPTH
+ */
+#define BARRIER_UNIT_SECTOR_BITS    17
+#define BARRIER_UNIT_SECTOR_SIZE    (1<<17)
+/*
+ * In struct r10conf, the following members are related to I/O barrier
+ * buckets,
+ *    atomic_t    *nr_pending;
+ *    atomic_t    *nr_waiting;
+ *    atomic_t    *nr_queued;
+ *    atomic_t    *barrier;
+ * Each of them points to array of atomic_t variables, each array is
+ * designed to have BARRIER_BUCKETS_NR elements and occupy a single
+ * memory page. The data width of atomic_t variables is 4 bytes, equal
+ * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined
+ * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of
+ * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly
+ * occupies a single memory page.
+ */
+#define BARRIER_BUCKETS_NR_BITS     (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
+#define BARRIER_BUCKETS_NR          (1<<BARRIER_BUCKETS_NR_BITS)
+
 /* Note: raid10_info.rdev can be set to NULL asynchronously by
  * raid10_remove_disk.
  * There are three safe ways to access raid10_info.rdev.
@@ -78,11 +103,12 @@ struct r10conf {
     int            pending_count;
 
     spinlock_t        resync_lock;
-    atomic_t        nr_pending;
-    int            nr_waiting;
-    int            nr_queued;
-    int            barrier;
-    int            array_freeze_pending;
+    atomic_t        nr_sync_pending;
+    atomic_t        *nr_pending;
+    atomic_t        *nr_waiting;
+    atomic_t        *nr_queued;
+    atomic_t        *barrier;
+    int            array_frozen;
     sector_t        next_resync;
     int            fullsync;  /* set to 1 if a full sync is needed,
                         * (fresh device added).
-- 
2.13.6
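
A note for reviewers, not part of the patch: below is a minimal userspace
sketch of how sector_to_idx() spreads I/O over the barrier buckets. It
assumes 4K pages and substitutes a hash_64()-style multiplicative hash for
the kernel's hash_long(), so the exact indices are illustrative only; the
property it demonstrates is that every sector inside one 64MB barrier unit
maps to the same bucket, so a resync raising the barrier on a bucket only
blocks regular I/O that hashes to that bucket.

/* barrier-bucket.c: userspace illustration only, not kernel code */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT               12   /* assumes 4K pages */
#define BARRIER_UNIT_SECTOR_BITS 17   /* (1<<17) 512-byte sectors == 64MB */
#define BARRIER_BUCKETS_NR_BITS  (PAGE_SHIFT - 2)  /* ilog2(sizeof(atomic_t)) == 2 */
#define BARRIER_BUCKETS_NR       (1 << BARRIER_BUCKETS_NR_BITS)

/* stand-in for the kernel's hash_long()/hash_64() */
static unsigned int hash64(uint64_t val, unsigned int bits)
{
    return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - bits));
}

static int sector_to_idx(uint64_t sector)
{
    return hash64(sector >> BARRIER_UNIT_SECTOR_BITS, BARRIER_BUCKETS_NR_BITS);
}

int main(void)
{
    /* sectors 0 and 131071 share barrier unit 0; sector 131072 starts unit 1 */
    printf("buckets=%d idx(0)=%d idx(131071)=%d idx(131072)=%d\n",
           BARRIER_BUCKETS_NR,
           sector_to_idx(0), sector_to_idx(131071), sector_to_idx(131072));
    return 0;
}

With 4K pages this gives PAGE_SIZE / sizeof(atomic_t) = 1024 buckets, i.e.
each of the four per-bucket counter arrays occupies exactly one page, which
is what the comment added to raid10.h describes.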